1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45#include "stringlib/eq.h"
46
47#ifdef MS_WINDOWS
48#include <windows.h>
49#endif
50
51/*[clinic input]
52class str "PyObject *" "&PyUnicode_Type"
53[clinic start generated code]*/
54/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55
56/*[python input]
57class Py_UCS4_converter(CConverter):
58 type = 'Py_UCS4'
59 converter = 'convert_uc'
60
61 def converter_init(self):
62 if self.default is not unspecified:
63 self.c_default = ascii(self.default)
64 if len(self.c_default) > 4 or self.c_default[0] != "'":
65 self.c_default = hex(ord(self.default))
66
67[python start generated code]*/
68/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
69
70/* --- Globals ------------------------------------------------------------
71
72NOTE: In the interpreter's initialization phase, some globals are currently
73 initialized dynamically as needed. In the process Unicode objects may
74 be created before the Unicode type is ready.
75
76*/
77
78
79#ifdef __cplusplus
80extern "C" {
81#endif
82
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assert()s of the accessor macros below.
   Debug builds run _PyUnicode_CheckConsistency() with check_content == 0;
   other builds only check the object type with PyUnicode_Check(). */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
91
/* Field accessors for the string structs.

   The macros that are a bare cast plus member access perform no checks and
   expand to an lvalue (this file assigns through several of them).  The
   other macros first assert() that the object passes _PyUnicode_CHECK(). */

/* Raw `utf8` field, viewed through PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string.  For a compact ASCII object the bytes
   located immediately after the PyASCIIObject header are returned instead
   of the `utf8` field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw `utf8_length` field (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII
   object this is the `length` field of the PyASCIIObject header. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw `wstr` field (no checks). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw `wstr_length` field, viewed through PyCompactUnicodeObject
   (no checks). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw `length` field (no checks). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw `state` bit-field struct (no checks). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw `hash` field (no checks). */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* `state.kind`; asserts _PyUnicode_CHECK() but not readiness. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* `length` field; asserts _PyUnicode_CHECK() but not readiness. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw `data.any` pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
126
/* Replace any previous definition of PyUnicode_READY with a file-local one
   that asserts _PyUnicode_CHECK().  It evaluates to 0 when the string is
   already ready and otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
133
/* True if the `utf8` pointer aliases the string's own character data,
   i.e. the UTF-8 representation is not a separate buffer.  Must not be used
   on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the `wstr` pointer aliases the string's own character data,
   i.e. the wchar_t representation is not a separate buffer.
   The object is referenced only through the macro parameter, so the macro
   works regardless of what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
141
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its
   `utf8` pointer is non-NULL and differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): `wstr` is non-NULL and either the string
   is not ready yet or `wstr` differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
155
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every source character is converted with a plain (to_type) cast.  The
   first _Py_SIZE_ROUND_DOWN(n, 4) characters are copied four per iteration;
   a second loop copies whatever is left.  (end - begin) characters are
   written through `to`. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        /* unrolled part: four characters per step */   \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        /* tail: the remaining characters */            \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
179
/* Platform-dependent over-allocation factor.
   NOTE(review): the percentages below suggest the extra space is
   size / OVERALLOCATE_FACTOR -- confirm at the use sites. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
187
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.
   It stays NULL until it is created on demand by
   _Py_INCREF_UNICODE_EMPTY(). */
static PyObject *unicode_empty = NULL;
200
/* Take a new reference to the shared empty string.  If it does not exist
   yet it is created with PyUnicode_New(0, 0) and then also INCREF'ed, so
   one reference stays with the global and one goes to the caller.  If the
   creation fails, unicode_empty remains NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL when the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
219
/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); a NULL
   entry means no shared object has been stored for that character yet
   (unicode_result_ready() fills entries lazily). */
static PyObject *unicode_latin1[256] = {NULL};
230
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code point 0..127: an entry is 1 when the
   character is whitespace; every entry not listed below is 0. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
261
/* Forward declarations of helpers that are used before their definition. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of UCS1 / UCS2 / UCS4 characters and its
   size. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
287
/* Fast detection of line break characters in the ASCII range.
   Lookup table indexed by code point 0..127: an entry is 1 when the
   character is a line break; every entry not listed below is 0. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
315
/* Forward declaration: this is the converter named by Py_UCS4_converter in
   the clinic input near the top of the file, so it has to be declared
   before the generated clinic header is included. */
static int convert_uc(PyObject *obj, void *addr);
317
318#include "clinic/unicodeobject.c.h"
319
/* Codec error handlers recognised by name.  _Py_ERROR_OTHER stands for any
   name that is not one of the handlers listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: handler name, or NULL.
   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other known names (exact, case-sensitive comparison) and
   _Py_ERROR_OTHER for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* No handler given means "strict". */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
358
359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
361Py_UNICODE
362PyUnicode_GetMax(void)
363{
364#ifdef Py_UNICODE_WIDE
365 return 0x10FFFF;
366#else
367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-only structural self-check of a string object.

   op:            object to check; asserted to be a unicode object.
   check_content: when non-zero (and the string is not in the wchar_t-only
                  state) the characters are scanned as well, to verify that
                  the narrowest possible kind is used and that the data is
                  followed by a null character.

   Every violated invariant trips an assert(); when all hold, 1 is returned
   so that the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1-byte kind and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact: the data lives behind the data.any pointer */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing buffer must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest character actually stored */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the data must be followed by a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
488
489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
497 Py_DECREF(unicode);
498 _Py_RETURN_UNICODE_EMPTY();
499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
503 if ((Py_UCS4)ch < 256) {
504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
511 Py_DECREF(unicode);
512 return NULL;
513 }
514#else
515 assert(Py_REFCNT(unicode) == 1);
516
517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
532 Py_DECREF(unicode);
533 _Py_RETURN_UNICODE_EMPTY();
534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
578 if (PyUnicode_READY(unicode) == -1)
579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
585 return _PyUnicode_Copy(unicode);
586}
587
588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
591backslashreplace(_PyBytesWriter *writer, char *str,
592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
594 Py_ssize_t size, i;
595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
615 incr = 2+8;
616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
641 }
642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
661 Py_ssize_t size, i;
662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
711/* --- Bloom Filters ----------------------------------------------------- */
712
713/* stuff to implement simple "bloom filters" for Unicode characters.
714 to keep things simple, we use a single bitmask, using the least 5
715 bits from each unicode characters as the bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
719#if LONG_BIT >= 128
720#define BLOOM_WIDTH 128
721#elif LONG_BIT >= 64
722#define BLOOM_WIDTH 64
723#elif LONG_BIT >= 32
724#define BLOOM_WIDTH 32
725#else
726#error "LONG_BIT is smaller than 32"
727#endif
728
729#define BLOOM_MASK unsigned long
730
731static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
732
733#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
734
735#define BLOOM_LINEBREAK(ch) \
736 ((ch) < 128U ? ascii_linebreak[(ch)] : \
737 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
738
739static inline BLOOM_MASK
740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
741{
742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
754 /* calculate simple bloom-style bitmask for a given unicode string */
755
756 BLOOM_MASK mask;
757
758 mask = 0;
759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
772 return mask;
773
774#undef BLOOM_UPDATE
775}
776
777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
807#include "stringlib/replace.h"
808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
818#include "stringlib/replace.h"
819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
829#include "stringlib/replace.h"
830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
838#include "stringlib/undef.h"
839
840/* --- Unicode Object ----------------------------------------------------- */
841
/* Forward declaration.  `fixfct` is a callback that receives a string
   object and returns a Py_UCS4. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
844
845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
849{
850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
858 case PyUnicode_2BYTE_KIND:
859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
865 case PyUnicode_4BYTE_KIND:
866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
870 default:
871 assert(0);
872 return -1;
873 }
874}
875
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length are
   overwritten (with 0xff bytes); nothing happens if the string did not
   grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
894
/* Resize a compact string to `length` characters by reallocating the object
   itself.

   The string must be modifiable, ready and compact (asserted).  Returns the
   (possibly moved) object, or NULL with MemoryError set when the new size
   would overflow or the reallocation fails.  On success the old pointer
   must no longer be used by the caller. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the size of one character in bytes */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* remember whether wstr aliases the data before the data may move */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* overflow check for struct_size + (length + 1) * char_size */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    /* a separately allocated UTF-8 buffer is dropped */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* reference bookkeeping around the realloc: the object is forgotten
       here and registered again below, under whichever address it ends up
       at */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        /* the old block is still in use: register it again */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* re-point the alias at the (possibly moved) data */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* a separately allocated wstr buffer is dropped */
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
961
/* Resize a non-compact string to `length` characters by reallocating its
   buffers; the object itself stays where it is.

   The string must be non-compact and have a reference count of 1
   (asserted).  For a ready string the character data is reallocated; the
   wstr buffer is reallocated afterwards when it exists and is not shared
   with the data.

   Returns 0 on success, -1 with MemoryError set on overflow or allocation
   failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* the kind value is used as the size of one character in bytes */
        char_size = PyUnicode_KIND(unicode);
        /* remember which buffers alias the data before it may move */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* overflow check for (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* a separately allocated UTF-8 buffer is dropped */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* on failure the object still holds the old pointer in data.any */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* re-point the aliases at the (possibly moved) data */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done unless a separate wstr buffer has to be resized as well */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* the object's wstr field is only updated after a successful realloc */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1040
1041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
1045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1046 PyObject *copy;
1047
1048 assert(PyUnicode_IS_READY(unicode));
1049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1056 return copy;
1057 }
1058 else {
1059 PyObject *w;
1060
1061 w = (PyObject*)_PyUnicode_New(length);
1062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
1066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1067 copy_length * sizeof(wchar_t));
1068 return w;
1069 }
1070}
1071
/* Allocate a new string object in the wchar_t-only (not ready) state with
   room for `length` characters.

   We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   For length 0 the shared empty string is returned (with a new reference)
   once it exists.  Returns NULL with MemoryError set when the size would
   overflow or an allocation fails, and NULL with SystemError set for a
   negative length.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/

static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)
{
    PyUnicodeObject *unicode;
    size_t new_size;

    /* Optimization for empty strings */
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return (PyUnicodeObject*)unicode_empty;
    }

    /* Ensure we won't overflow the size. */
    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }
    if (length < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to _PyUnicode_New");
        return NULL;
    }

    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
    if (unicode == NULL)
        return NULL;
    /* size in bytes of the wstr buffer, including the terminator */
    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);

    /* Initialize every field before the buffer allocation below, so that
       the object is fully set up when it is released with Py_DECREF() on
       the failure path. */
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = 0;
    _PyUnicode_STATE(unicode).compact = 0;
    _PyUnicode_STATE(unicode).ready = 0;
    _PyUnicode_STATE(unicode).ascii = 0;
    _PyUnicode_DATA_ANY(unicode) = NULL;
    _PyUnicode_LENGTH(unicode) = 0;
    _PyUnicode_UTF8(unicode) = NULL;
    _PyUnicode_UTF8_LENGTH(unicode) = 0;

    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
    if (!_PyUnicode_WSTR(unicode)) {
        Py_DECREF(unicode);
        PyErr_NoMemory();
        return NULL;
    }

    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    _PyUnicode_WSTR(unicode)[0] = 0;
    /* terminating null character */
    _PyUnicode_WSTR(unicode)[length] = 0;

    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
    return unicode;
}
1140
1141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
1144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
1146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
1150 switch (PyUnicode_KIND(unicode))
1151 {
1152 case PyUnicode_1BYTE_KIND:
1153 if (PyUnicode_IS_ASCII(unicode))
1154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
1166 switch (PyUnicode_KIND(unicode)) {
1167 case PyUnicode_1BYTE_KIND:
1168 if (PyUnicode_IS_ASCII(unicode))
1169 return "ascii";
1170 else
1171 return "latin1";
1172 case PyUnicode_2BYTE_KIND:
1173 return "UCS2";
1174 case PyUnicode_4BYTE_KIND:
1175 return "UCS4";
1176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
1181#ifdef Py_DEBUG
1182/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
1199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
1204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
1207
1208 if (ascii->state.compact)
1209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
1215 else
1216 data = unicode->data.any;
1217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
1219
1220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
1223
1224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
1228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
1230 }
1231 printf(", data=%p\n", data);
1232}
1233#endif
1234
/* Create a new compact string able to hold `size` characters whose code
   points do not exceed `maxchar`.

   The header and the character buffer are allocated as a single memory
   block.  Only the terminating null character is written here; filling in
   the characters is left to the caller.

   Return a new reference, or NULL with an exception set:
   - SystemError if `maxchar` exceeds MAX_UNICODE or `size` is negative,
   - MemoryError if the size would overflow or the allocation fails. */
PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{
    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
    enum PyUnicode_Kind kind;
    int is_sharing, is_ascii;
    Py_ssize_t char_size;
    Py_ssize_t struct_size;

    /* Optimization for empty strings */
    if (size == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Select the storage kind, the character width and the header size
       from maxchar.  is_sharing is set when wchar_t has the same width as
       the chosen kind, in which case wstr can alias the data buffer. */
    is_ascii = 0;
    is_sharing = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
    if (maxchar < 128) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
        is_ascii = 1;
        /* ASCII strings use the smaller PyASCIIObject header. */
        struct_size = sizeof(PyASCIIObject);
    }
    else if (maxchar < 256) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
    }
    else if (maxchar < 65536) {
        kind = PyUnicode_2BYTE_KIND;
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            is_sharing = 1;
    }
    else {
        if (maxchar > MAX_UNICODE) {
            PyErr_SetString(PyExc_SystemError,
                            "invalid maximum character passed to PyUnicode_New");
            return NULL;
        }
        kind = PyUnicode_4BYTE_KIND;
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            is_sharing = 1;
    }

    /* Ensure we won't overflow the size. */
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_New");
        return NULL;
    }
    /* header + (size + 1) characters must fit in a Py_ssize_t */
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
        return PyErr_NoMemory();

    /* Duplicated allocation code from _PyObject_New() instead of a call to
     * PyObject_New() so we are able to allocate space for the object and
     * its data buffer.
     */
    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
    if (obj == NULL)
        return PyErr_NoMemory();
    obj = PyObject_INIT(obj, &PyUnicode_Type);
    if (obj == NULL)
        return NULL;

    unicode = (PyCompactUnicodeObject *)obj;
    /* The characters start right after the header in use. */
    if (is_ascii)
        data = ((PyASCIIObject*)obj) + 1;
    else
        data = unicode + 1;
    _PyUnicode_LENGTH(unicode) = size;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = kind;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ready = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
    if (is_ascii) {
        /* Only the PyASCIIObject header was allocated: the utf8 and
           wstr_length fields of the larger header must not be touched. */
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
    }
    else if (kind == PyUnicode_1BYTE_KIND) {
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
    }
    else {
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
        if (kind == PyUnicode_2BYTE_KIND)
            ((Py_UCS2*)data)[size] = 0;
        else /* kind == PyUnicode_4BYTE_KIND */
            ((Py_UCS4*)data)[size] = 0;
        if (is_sharing) {
            /* wchar_t has the width of this kind: wstr aliases the data */
            _PyUnicode_WSTR_LENGTH(unicode) = size;
            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
        }
        else {
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
            _PyUnicode_WSTR(unicode) = NULL;
        }
    }
#ifdef Py_DEBUG
    unicode_fill_invalid((PyObject*)unicode, 0);
#endif
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    return obj;
}
1348
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.  A high surrogate directly
   followed by a low surrogate is joined into one code point; any other
   unit, including a lone surrogate, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input units, one output code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src;
        src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1388
1389static int
1390unicode_check_modifiable(PyObject *unicode)
1391{
1392 if (!unicode_modifiable(unicode)) {
1393 PyErr_SetString(PyExc_SystemError,
1394 "Cannot modify a string currently used");
1395 return -1;
1396 }
1397 return 0;
1398}
1399
1400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
1404{
1405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
1407
1408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
1411 assert(PyUnicode_Check(from));
1412 assert(PyUnicode_IS_READY(from));
1413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1414
1415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
1419 if (how_many == 0)
1420 return 0;
1421
1422 from_kind = PyUnicode_KIND(from);
1423 from_data = PyUnicode_DATA(from);
1424 to_kind = PyUnicode_KIND(to);
1425 to_data = PyUnicode_DATA(to);
1426
1427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
1441 if (from_kind == to_kind) {
1442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
1445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
1447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
1452 }
1453 memcpy((char*)to_data + to_kind * to_start,
1454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
1456 }
1457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
1459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
1466 }
1467 else if (from_kind == PyUnicode_1BYTE_KIND
1468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
1476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
1486 }
1487 else {
1488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
1490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
1526 else {
1527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1528 Py_UCS4 ch;
1529 Py_ssize_t i;
1530
1531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1533 if (ch > to_maxchar)
1534 return -1;
1535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
1537 }
1538 }
1539 return 0;
1540}
1541
1542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
1562 if (PyUnicode_READY(from) == -1)
1563 return -1;
1564 if (PyUnicode_READY(to) == -1)
1565 return -1;
1566
1567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
1571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
1575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
1584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
1591 if (unicode_check_modifiable(to))
1592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
1603 return how_many;
1604}
1605
1606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1614{
1615 const wchar_t *iter;
1616 Py_UCS4 ch;
1617
1618 assert(num_surrogates != NULL && maxchar != NULL);
1619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
1623#if SIZEOF_WCHAR_T == 2
1624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
1631 }
1632 else
1633#endif
1634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
1647 }
1648 return 0;
1649}
1650
/* Turn a legacy string that only has a wstr buffer into a ready string by
   building its canonical representation, using the narrowest kind that can
   store its maximum character.

   Depending on the width of wchar_t, the wstr buffer is either adopted as
   the canonical data (same character width) or converted into a newly
   allocated buffer and then freed.

   Return 0 on success.  Return -1 with an exception set if the string
   contains a code point greater than MAX_UNICODE (ValueError) or if memory
   allocation fails (MemoryError). */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the wstr buffer decides which kind is needed. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1 byte per character: always narrower than wchar_t, so a new
           buffer is needed (+1 for the terminating null character). */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 form. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wide buffer has been converted and is dropped. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* The wide buffer has been converted and is dropped. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single code point. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the wstr buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1779
1780static void
1781unicode_dealloc(PyObject *unicode)
1782{
1783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1784 case SSTATE_NOT_INTERNED:
1785 break;
1786
1787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
1790 if (PyDict_DelItem(interned, unicode) != 0)
1791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
1794
1795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
1797 /* fall through */
1798
1799 default:
1800 Py_FatalError("Inconsistent interned string state.");
1801 }
1802
1803 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1804 PyObject_DEL(_PyUnicode_WSTR(unicode));
1805 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1806 PyObject_DEL(_PyUnicode_UTF8(unicode));
1807 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1808 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1809
1810 Py_TYPE(unicode)->tp_free(unicode);
1811}
1812
#ifdef Py_DEBUG
/* Return 1 if `unicode` is one of the shared singletons (the empty string
   or a cached single Latin-1 character), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with canonical data can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return (unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1829
1830static int
1831unicode_modifiable(PyObject *unicode)
1832{
1833 assert(_PyUnicode_CHECK(unicode));
1834 if (Py_REFCNT(unicode) != 1)
1835 return 0;
1836 if (_PyUnicode_HASH(unicode) != -1)
1837 return 0;
1838 if (PyUnicode_CHECK_INTERNED(unicode))
1839 return 0;
1840 if (!PyUnicode_CheckExact(unicode))
1841 return 0;
1842#ifdef Py_DEBUG
1843 /* singleton refcount is greater than 1 */
1844 assert(!unicode_is_singleton(unicode));
1845#endif
1846 return 1;
1847}
1848
1849static int
1850unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1851{
1852 PyObject *unicode;
1853 Py_ssize_t old_length;
1854
1855 assert(p_unicode != NULL);
1856 unicode = *p_unicode;
1857
1858 assert(unicode != NULL);
1859 assert(PyUnicode_Check(unicode));
1860 assert(0 <= length);
1861
1862 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1863 old_length = PyUnicode_WSTR_LENGTH(unicode);
1864 else
1865 old_length = PyUnicode_GET_LENGTH(unicode);
1866 if (old_length == length)
1867 return 0;
1868
1869 if (length == 0) {
1870 _Py_INCREF_UNICODE_EMPTY();
1871 if (!unicode_empty)
1872 return -1;
1873 Py_SETREF(*p_unicode, unicode_empty);
1874 return 0;
1875 }
1876
1877 if (!unicode_modifiable(unicode)) {
1878 PyObject *copy = resize_copy(unicode, length);
1879 if (copy == NULL)
1880 return -1;
1881 Py_SETREF(*p_unicode, copy);
1882 return 0;
1883 }
1884
1885 if (PyUnicode_IS_COMPACT(unicode)) {
1886 PyObject *new_unicode = resize_compact(unicode, length);
1887 if (new_unicode == NULL)
1888 return -1;
1889 *p_unicode = new_unicode;
1890 return 0;
1891 }
1892 return resize_inplace(unicode, length);
1893}
1894
1895int
1896PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1897{
1898 PyObject *unicode;
1899 if (p_unicode == NULL) {
1900 PyErr_BadInternalCall();
1901 return -1;
1902 }
1903 unicode = *p_unicode;
1904 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1905 {
1906 PyErr_BadInternalCall();
1907 return -1;
1908 }
1909 return unicode_resize(p_unicode, length);
1910}
1911
1912/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1913
1914 WARNING: The function doesn't copy the terminating null character and
1915 doesn't check the maximum character (may write a latin1 character in an
1916 ASCII string). */
1917static void
1918unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1919 const char *str, Py_ssize_t len)
1920{
1921 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1922 void *data = PyUnicode_DATA(unicode);
1923 const char *end = str + len;
1924
1925 switch (kind) {
1926 case PyUnicode_1BYTE_KIND: {
1927 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1928#ifdef Py_DEBUG
1929 if (PyUnicode_IS_ASCII(unicode)) {
1930 Py_UCS4 maxchar = ucs1lib_find_max_char(
1931 (const Py_UCS1*)str,
1932 (const Py_UCS1*)str + len);
1933 assert(maxchar < 128);
1934 }
1935#endif
1936 memcpy((char *) data + index, str, len);
1937 break;
1938 }
1939 case PyUnicode_2BYTE_KIND: {
1940 Py_UCS2 *start = (Py_UCS2 *)data + index;
1941 Py_UCS2 *ucs2 = start;
1942 assert(index <= PyUnicode_GET_LENGTH(unicode));
1943
1944 for (; str < end; ++ucs2, ++str)
1945 *ucs2 = (Py_UCS2)*str;
1946
1947 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1948 break;
1949 }
1950 default: {
1951 Py_UCS4 *start = (Py_UCS4 *)data + index;
1952 Py_UCS4 *ucs4 = start;
1953 assert(kind == PyUnicode_4BYTE_KIND);
1954 assert(index <= PyUnicode_GET_LENGTH(unicode));
1955
1956 for (; str < end; ++ucs4, ++str)
1957 *ucs4 = (Py_UCS4)*str;
1958
1959 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1960 }
1961 }
1962}
1963
1964static PyObject*
1965get_latin1_char(unsigned char ch)
1966{
1967 PyObject *unicode = unicode_latin1[ch];
1968 if (!unicode) {
1969 unicode = PyUnicode_New(1, ch);
1970 if (!unicode)
1971 return NULL;
1972 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1973 assert(_PyUnicode_CheckConsistency(unicode, 1));
1974 unicode_latin1[ch] = unicode;
1975 }
1976 Py_INCREF(unicode);
1977 return unicode;
1978}
1979
1980static PyObject*
1981unicode_char(Py_UCS4 ch)
1982{
1983 PyObject *unicode;
1984
1985 assert(ch <= MAX_UNICODE);
1986
1987 if (ch < 256)
1988 return get_latin1_char(ch);
1989
1990 unicode = PyUnicode_New(1, ch);
1991 if (unicode == NULL)
1992 return NULL;
1993
1994 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1995 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1996 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1997 } else {
1998 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1999 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2000 }
2001 assert(_PyUnicode_CheckConsistency(unicode, 1));
2002 return unicode;
2003}
2004
2005PyObject *
2006PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2007{
2008 if (u == NULL)
2009 return (PyObject*)_PyUnicode_New(size);
2010
2011 if (size < 0) {
2012 PyErr_BadInternalCall();
2013 return NULL;
2014 }
2015
2016 return PyUnicode_FromWideChar(u, size);
2017}
2018
2019PyObject *
2020PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2021{
2022 PyObject *unicode;
2023 Py_UCS4 maxchar = 0;
2024 Py_ssize_t num_surrogates;
2025
2026 if (u == NULL && size != 0) {
2027 PyErr_BadInternalCall();
2028 return NULL;
2029 }
2030
2031 if (size == -1) {
2032 size = wcslen(u);
2033 }
2034
2035 /* If the Unicode data is known at construction time, we can apply
2036 some optimizations which share commonly used objects. */
2037
2038 /* Optimization for empty strings */
2039 if (size == 0)
2040 _Py_RETURN_UNICODE_EMPTY();
2041
2042 /* Single character Unicode objects in the Latin-1 range are
2043 shared when using this constructor */
2044 if (size == 1 && (Py_UCS4)*u < 256)
2045 return get_latin1_char((unsigned char)*u);
2046
2047 /* If not empty and not single character, copy the Unicode data
2048 into the new object */
2049 if (find_maxchar_surrogates(u, u + size,
2050 &maxchar, &num_surrogates) == -1)
2051 return NULL;
2052
2053 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2054 if (!unicode)
2055 return NULL;
2056
2057 switch (PyUnicode_KIND(unicode)) {
2058 case PyUnicode_1BYTE_KIND:
2059 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2060 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2061 break;
2062 case PyUnicode_2BYTE_KIND:
2063#if Py_UNICODE_SIZE == 2
2064 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2065#else
2066 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2067 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2068#endif
2069 break;
2070 case PyUnicode_4BYTE_KIND:
2071#if SIZEOF_WCHAR_T == 2
2072 /* This is the only case which has to process surrogates, thus
2073 a simple copy loop is not enough and we need a function. */
2074 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2075#else
2076 assert(num_surrogates == 0);
2077 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2078#endif
2079 break;
2080 default:
2081 assert(0 && "Impossible state");
2082 }
2083
2084 return unicode_result(unicode);
2085}
2086
2087PyObject *
2088PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2089{
2090 if (size < 0) {
2091 PyErr_SetString(PyExc_SystemError,
2092 "Negative size passed to PyUnicode_FromStringAndSize");
2093 return NULL;
2094 }
2095 if (u != NULL)
2096 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2097 else
2098 return (PyObject *)_PyUnicode_New(size);
2099}
2100
2101PyObject *
2102PyUnicode_FromString(const char *u)
2103{
2104 size_t size = strlen(u);
2105 if (size > PY_SSIZE_T_MAX) {
2106 PyErr_SetString(PyExc_OverflowError, "input too long");
2107 return NULL;
2108 }
2109 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2110}
2111
2112PyObject *
2113_PyUnicode_FromId(_Py_Identifier *id)
2114{
2115 if (!id->object) {
2116 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2117 strlen(id->string),
2118 NULL, NULL);
2119 if (!id->object)
2120 return NULL;
2121 PyUnicode_InternInPlace(&id->object);
2122 assert(!id->next);
2123 id->next = static_strings;
2124 static_strings = id;
2125 }
2126 return id->object;
2127}
2128
2129void
2130_PyUnicode_ClearStaticStrings()
2131{
2132 _Py_Identifier *tmp, *s = static_strings;
2133 while (s) {
2134 Py_CLEAR(s->object);
2135 tmp = s->next;
2136 s->next = NULL;
2137 s = tmp;
2138 }
2139 static_strings = NULL;
2140}
2141
2142/* Internal function, doesn't check maximum character */
2143
2144PyObject*
2145_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2146{
2147 const unsigned char *s = (const unsigned char *)buffer;
2148 PyObject *unicode;
2149 if (size == 1) {
2150#ifdef Py_DEBUG
2151 assert((unsigned char)s[0] < 128);
2152#endif
2153 return get_latin1_char(s[0]);
2154 }
2155 unicode = PyUnicode_New(size, 127);
2156 if (!unicode)
2157 return NULL;
2158 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2159 assert(_PyUnicode_CheckConsistency(unicode, 1));
2160 return unicode;
2161}
2162
2163static Py_UCS4
2164kind_maxchar_limit(unsigned int kind)
2165{
2166 switch (kind) {
2167 case PyUnicode_1BYTE_KIND:
2168 return 0x80;
2169 case PyUnicode_2BYTE_KIND:
2170 return 0x100;
2171 case PyUnicode_4BYTE_KIND:
2172 return 0x10000;
2173 default:
2174 assert(0 && "invalid kind");
2175 return MAX_UNICODE;
2176 }
2177}
2178
2179static inline Py_UCS4
2180align_maxchar(Py_UCS4 maxchar)
2181{
2182 if (maxchar <= 127)
2183 return 127;
2184 else if (maxchar <= 255)
2185 return 255;
2186 else if (maxchar <= 65535)
2187 return 65535;
2188 else
2189 return MAX_UNICODE;
2190}
2191
2192static PyObject*
2193_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2194{
2195 PyObject *res;
2196 unsigned char max_char;
2197
2198 if (size == 0)
2199 _Py_RETURN_UNICODE_EMPTY();
2200 assert(size > 0);
2201 if (size == 1)
2202 return get_latin1_char(u[0]);
2203
2204 max_char = ucs1lib_find_max_char(u, u + size);
2205 res = PyUnicode_New(size, max_char);
2206 if (!res)
2207 return NULL;
2208 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2209 assert(_PyUnicode_CheckConsistency(res, 1));
2210 return res;
2211}
2212
2213static PyObject*
2214_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2215{
2216 PyObject *res;
2217 Py_UCS2 max_char;
2218
2219 if (size == 0)
2220 _Py_RETURN_UNICODE_EMPTY();
2221 assert(size > 0);
2222 if (size == 1)
2223 return unicode_char(u[0]);
2224
2225 max_char = ucs2lib_find_max_char(u, u + size);
2226 res = PyUnicode_New(size, max_char);
2227 if (!res)
2228 return NULL;
2229 if (max_char >= 256)
2230 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2231 else {
2232 _PyUnicode_CONVERT_BYTES(
2233 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2234 }
2235 assert(_PyUnicode_CheckConsistency(res, 1));
2236 return res;
2237}
2238
2239static PyObject*
2240_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2241{
2242 PyObject *res;
2243 Py_UCS4 max_char;
2244
2245 if (size == 0)
2246 _Py_RETURN_UNICODE_EMPTY();
2247 assert(size > 0);
2248 if (size == 1)
2249 return unicode_char(u[0]);
2250
2251 max_char = ucs4lib_find_max_char(u, u + size);
2252 res = PyUnicode_New(size, max_char);
2253 if (!res)
2254 return NULL;
2255 if (max_char < 256)
2256 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2257 PyUnicode_1BYTE_DATA(res));
2258 else if (max_char < 0x10000)
2259 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2260 PyUnicode_2BYTE_DATA(res));
2261 else
2262 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2263 assert(_PyUnicode_CheckConsistency(res, 1));
2264 return res;
2265}
2266
2267PyObject*
2268PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2269{
2270 if (size < 0) {
2271 PyErr_SetString(PyExc_ValueError, "size must be positive");
2272 return NULL;
2273 }
2274 switch (kind) {
2275 case PyUnicode_1BYTE_KIND:
2276 return _PyUnicode_FromUCS1(buffer, size);
2277 case PyUnicode_2BYTE_KIND:
2278 return _PyUnicode_FromUCS2(buffer, size);
2279 case PyUnicode_4BYTE_KIND:
2280 return _PyUnicode_FromUCS4(buffer, size);
2281 default:
2282 PyErr_SetString(PyExc_SystemError, "invalid kind");
2283 return NULL;
2284 }
2285}
2286
2287Py_UCS4
2288_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2289{
2290 enum PyUnicode_Kind kind;
2291 void *startptr, *endptr;
2292
2293 assert(PyUnicode_IS_READY(unicode));
2294 assert(0 <= start);
2295 assert(end <= PyUnicode_GET_LENGTH(unicode));
2296 assert(start <= end);
2297
2298 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2299 return PyUnicode_MAX_CHAR_VALUE(unicode);
2300
2301 if (start == end)
2302 return 127;
2303
2304 if (PyUnicode_IS_ASCII(unicode))
2305 return 127;
2306
2307 kind = PyUnicode_KIND(unicode);
2308 startptr = PyUnicode_DATA(unicode);
2309 endptr = (char *)startptr + end * kind;
2310 startptr = (char *)startptr + start * kind;
2311 switch(kind) {
2312 case PyUnicode_1BYTE_KIND:
2313 return ucs1lib_find_max_char(startptr, endptr);
2314 case PyUnicode_2BYTE_KIND:
2315 return ucs2lib_find_max_char(startptr, endptr);
2316 case PyUnicode_4BYTE_KIND:
2317 return ucs4lib_find_max_char(startptr, endptr);
2318 default:
2319 assert(0);
2320 return 0;
2321 }
2322}
2323
2324/* Ensure that a string uses the most efficient storage, if it is not the
2325 case: create a new string with of the right kind. Write NULL into *p_unicode
2326 on error. */
2327static void
2328unicode_adjust_maxchar(PyObject **p_unicode)
2329{
2330 PyObject *unicode, *copy;
2331 Py_UCS4 max_char;
2332 Py_ssize_t len;
2333 unsigned int kind;
2334
2335 assert(p_unicode != NULL);
2336 unicode = *p_unicode;
2337 assert(PyUnicode_IS_READY(unicode));
2338 if (PyUnicode_IS_ASCII(unicode))
2339 return;
2340
2341 len = PyUnicode_GET_LENGTH(unicode);
2342 kind = PyUnicode_KIND(unicode);
2343 if (kind == PyUnicode_1BYTE_KIND) {
2344 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2345 max_char = ucs1lib_find_max_char(u, u + len);
2346 if (max_char >= 128)
2347 return;
2348 }
2349 else if (kind == PyUnicode_2BYTE_KIND) {
2350 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2351 max_char = ucs2lib_find_max_char(u, u + len);
2352 if (max_char >= 256)
2353 return;
2354 }
2355 else {
2356 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2357 assert(kind == PyUnicode_4BYTE_KIND);
2358 max_char = ucs4lib_find_max_char(u, u + len);
2359 if (max_char >= 0x10000)
2360 return;
2361 }
2362 copy = PyUnicode_New(len, max_char);
2363 if (copy != NULL)
2364 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2365 Py_DECREF(unicode);
2366 *p_unicode = copy;
2367}
2368
2369PyObject*
2370_PyUnicode_Copy(PyObject *unicode)
2371{
2372 Py_ssize_t length;
2373 PyObject *copy;
2374
2375 if (!PyUnicode_Check(unicode)) {
2376 PyErr_BadInternalCall();
2377 return NULL;
2378 }
2379 if (PyUnicode_READY(unicode) == -1)
2380 return NULL;
2381
2382 length = PyUnicode_GET_LENGTH(unicode);
2383 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2384 if (!copy)
2385 return NULL;
2386 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2387
2388 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2389 length * PyUnicode_KIND(unicode));
2390 assert(_PyUnicode_CheckConsistency(copy, 1));
2391 return copy;
2392}
2393
2394
2395/* Widen Unicode objects to larger buffers. Don't write terminating null
2396 character. Return NULL on error. */
2397
2398void*
2399_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2400{
2401 Py_ssize_t len;
2402 void *result;
2403 unsigned int skind;
2404
2405 if (PyUnicode_READY(s) == -1)
2406 return NULL;
2407
2408 len = PyUnicode_GET_LENGTH(s);
2409 skind = PyUnicode_KIND(s);
2410 if (skind >= kind) {
2411 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2412 return NULL;
2413 }
2414 switch (kind) {
2415 case PyUnicode_2BYTE_KIND:
2416 result = PyMem_New(Py_UCS2, len);
2417 if (!result)
2418 return PyErr_NoMemory();
2419 assert(skind == PyUnicode_1BYTE_KIND);
2420 _PyUnicode_CONVERT_BYTES(
2421 Py_UCS1, Py_UCS2,
2422 PyUnicode_1BYTE_DATA(s),
2423 PyUnicode_1BYTE_DATA(s) + len,
2424 result);
2425 return result;
2426 case PyUnicode_4BYTE_KIND:
2427 result = PyMem_New(Py_UCS4, len);
2428 if (!result)
2429 return PyErr_NoMemory();
2430 if (skind == PyUnicode_2BYTE_KIND) {
2431 _PyUnicode_CONVERT_BYTES(
2432 Py_UCS2, Py_UCS4,
2433 PyUnicode_2BYTE_DATA(s),
2434 PyUnicode_2BYTE_DATA(s) + len,
2435 result);
2436 }
2437 else {
2438 assert(skind == PyUnicode_1BYTE_KIND);
2439 _PyUnicode_CONVERT_BYTES(
2440 Py_UCS1, Py_UCS4,
2441 PyUnicode_1BYTE_DATA(s),
2442 PyUnicode_1BYTE_DATA(s) + len,
2443 result);
2444 }
2445 return result;
2446 default:
2447 break;
2448 }
2449 PyErr_SetString(PyExc_SystemError, "invalid kind");
2450 return NULL;
2451}
2452
2453static Py_UCS4*
2454as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2455 int copy_null)
2456{
2457 int kind;
2458 void *data;
2459 Py_ssize_t len, targetlen;
2460 if (PyUnicode_READY(string) == -1)
2461 return NULL;
2462 kind = PyUnicode_KIND(string);
2463 data = PyUnicode_DATA(string);
2464 len = PyUnicode_GET_LENGTH(string);
2465 targetlen = len;
2466 if (copy_null)
2467 targetlen++;
2468 if (!target) {
2469 target = PyMem_New(Py_UCS4, targetlen);
2470 if (!target) {
2471 PyErr_NoMemory();
2472 return NULL;
2473 }
2474 }
2475 else {
2476 if (targetsize < targetlen) {
2477 PyErr_Format(PyExc_SystemError,
2478 "string is longer than the buffer");
2479 if (copy_null && 0 < targetsize)
2480 target[0] = 0;
2481 return NULL;
2482 }
2483 }
2484 if (kind == PyUnicode_1BYTE_KIND) {
2485 Py_UCS1 *start = (Py_UCS1 *) data;
2486 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2487 }
2488 else if (kind == PyUnicode_2BYTE_KIND) {
2489 Py_UCS2 *start = (Py_UCS2 *) data;
2490 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2491 }
2492 else {
2493 assert(kind == PyUnicode_4BYTE_KIND);
2494 memcpy(target, data, len * sizeof(Py_UCS4));
2495 }
2496 if (copy_null)
2497 target[len] = 0;
2498 return target;
2499}
2500
2501Py_UCS4*
2502PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2503 int copy_null)
2504{
2505 if (target == NULL || targetsize < 0) {
2506 PyErr_BadInternalCall();
2507 return NULL;
2508 }
2509 return as_ucs4(string, target, targetsize, copy_null);
2510}
2511
2512Py_UCS4*
2513PyUnicode_AsUCS4Copy(PyObject *string)
2514{
2515 return as_ucs4(string, NULL, 0, 1);
2516}
2517
/* Size of the char buffers that receive sprintf() output for the integer
   conversions and for %p in unicode_fromformat_arg().

   maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2522
2523static int
2524unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2525 Py_ssize_t width, Py_ssize_t precision)
2526{
2527 Py_ssize_t length, fill, arglen;
2528 Py_UCS4 maxchar;
2529
2530 if (PyUnicode_READY(str) == -1)
2531 return -1;
2532
2533 length = PyUnicode_GET_LENGTH(str);
2534 if ((precision == -1 || precision >= length)
2535 && width <= length)
2536 return _PyUnicodeWriter_WriteStr(writer, str);
2537
2538 if (precision != -1)
2539 length = Py_MIN(precision, length);
2540
2541 arglen = Py_MAX(length, width);
2542 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2543 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2544 else
2545 maxchar = writer->maxchar;
2546
2547 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2548 return -1;
2549
2550 if (width > length) {
2551 fill = width - length;
2552 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2553 return -1;
2554 writer->pos += fill;
2555 }
2556
2557 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2558 str, 0, length);
2559 writer->pos += length;
2560 return 0;
2561}
2562
2563static int
2564unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2565 Py_ssize_t width, Py_ssize_t precision)
2566{
2567 /* UTF-8 */
2568 Py_ssize_t length;
2569 PyObject *unicode;
2570 int res;
2571
2572 length = strlen(str);
2573 if (precision != -1)
2574 length = Py_MIN(length, precision);
2575 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2576 if (unicode == NULL)
2577 return -1;
2578
2579 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2580 Py_DECREF(unicode);
2581 return res;
2582}
2583
/* Process a single '%' directive of a PyUnicode_FromFormat()-style format
   string and append its expansion to `writer`.

   f      points at the '%' that starts the directive.
   vargs  points to the argument list; the argument(s) the directive
          needs are consumed from it with va_arg().

   Returns a pointer to the first format character after the directive, or
   NULL with an exception set on error.  For an unknown conversion
   character the remainder of the format string, starting at the '%', is
   copied verbatim and the returned pointer addresses its terminating
   null character. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the directive starts (used by the default case) */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* reject the digit if appending it would overflow Py_ssize_t */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* the conversion character is the last character of the format:
       turn off overallocation in the writer */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character given as an int code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* format the bare number with the C library first; the length
           flags parsed above select the argument type read by va_arg() */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
            else if (longlongflag)
                len = sprintf(buffer, "%llu",
                        va_arg(*vargs, unsigned long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
            else if (longlongflag)
                len = sprintf(buffer, "%lli",
                        va_arg(*vargs, long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* from here on, precision is at least the number of characters
           produced by sprintf() */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* pad up to the field width with '0' or ' ' (per the '0' flag) */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* pad up to the precision with '0' */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* shift the digits (and the null byte) right to make room
               for the "0x" prefix */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a str object, written as-is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* two arguments: a str object, or if that is NULL, a C string */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* the result of PyObject_Str(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* the result of PyObject_Repr(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* the result of PyObject_ASCII(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* a literal percent sign; no argument is consumed */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
2881
2882PyObject *
2883PyUnicode_FromFormatV(const char *format, va_list vargs)
2884{
2885 va_list vargs2;
2886 const char *f;
2887 _PyUnicodeWriter writer;
2888
2889 _PyUnicodeWriter_Init(&writer);
2890 writer.min_length = strlen(format) + 100;
2891 writer.overallocate = 1;
2892
2893 // Copy varags to be able to pass a reference to a subfunction.
2894 va_copy(vargs2, vargs);
2895
2896 for (f = format; *f; ) {
2897 if (*f == '%') {
2898 f = unicode_fromformat_arg(&writer, f, &vargs2);
2899 if (f == NULL)
2900 goto fail;
2901 }
2902 else {
2903 const char *p;
2904 Py_ssize_t len;
2905
2906 p = f;
2907 do
2908 {
2909 if ((unsigned char)*p > 127) {
2910 PyErr_Format(PyExc_ValueError,
2911 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2912 "string, got a non-ASCII byte: 0x%02x",
2913 (unsigned char)*p);
2914 goto fail;
2915 }
2916 p++;
2917 }
2918 while (*p != '\0' && *p != '%');
2919 len = p - f;
2920
2921 if (*p == '\0')
2922 writer.overallocate = 0;
2923
2924 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2925 goto fail;
2926
2927 f = p;
2928 }
2929 }
2930 va_end(vargs2);
2931 return _PyUnicodeWriter_Finish(&writer);
2932
2933 fail:
2934 va_end(vargs2);
2935 _PyUnicodeWriter_Dealloc(&writer);
2936 return NULL;
2937}
2938
2939PyObject *
2940PyUnicode_FromFormat(const char *format, ...)
2941{
2942 PyObject* ret;
2943 va_list vargs;
2944
2945#ifdef HAVE_STDARG_PROTOTYPES
2946 va_start(vargs, format);
2947#else
2948 va_start(vargs);
2949#endif
2950 ret = PyUnicode_FromFormatV(format, vargs);
2951 va_end(vargs);
2952 return ret;
2953}
2954
2955#ifdef HAVE_WCHAR_H
2956
2957/* Convert a Unicode object to a wide character string.
2958
2959 - If w is NULL: return the number of wide characters (including the null
2960 character) required to convert the unicode object. Ignore size argument.
2961
2962 - Otherwise: return the number of wide characters (excluding the null
2963 character) written into w. Write at most size wide characters (including
2964 the null character). */
2965Py_ssize_t
2966PyUnicode_AsWideChar(PyObject *unicode,
2967 wchar_t *w,
2968 Py_ssize_t size)
2969{
2970 Py_ssize_t res;
2971 const wchar_t *wstr;
2972
2973 if (unicode == NULL) {
2974 PyErr_BadInternalCall();
2975 return -1;
2976 }
2977 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2978 if (wstr == NULL)
2979 return -1;
2980
2981 if (w != NULL) {
2982 if (size > res)
2983 size = res + 1;
2984 else
2985 res = size;
2986 memcpy(w, wstr, size * sizeof(wchar_t));
2987 return res;
2988 }
2989 else
2990 return res + 1;
2991}
2992
2993wchar_t*
2994PyUnicode_AsWideCharString(PyObject *unicode,
2995 Py_ssize_t *size)
2996{
2997 const wchar_t *wstr;
2998 wchar_t *buffer;
2999 Py_ssize_t buflen;
3000
3001 if (unicode == NULL) {
3002 PyErr_BadInternalCall();
3003 return NULL;
3004 }
3005
3006 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3007 if (wstr == NULL) {
3008 return NULL;
3009 }
3010 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3011 PyErr_SetString(PyExc_ValueError,
3012 "embedded null character");
3013 return NULL;
3014 }
3015
3016 buffer = PyMem_NEW(wchar_t, buflen + 1);
3017 if (buffer == NULL) {
3018 PyErr_NoMemory();
3019 return NULL;
3020 }
3021 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
3022 if (size != NULL)
3023 *size = buflen;
3024 return buffer;
3025}
3026
3027#endif /* HAVE_WCHAR_H */
3028
3029PyObject *
3030PyUnicode_FromOrdinal(int ordinal)
3031{
3032 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3033 PyErr_SetString(PyExc_ValueError,
3034 "chr() arg not in range(0x110000)");
3035 return NULL;
3036 }
3037
3038 return unicode_char((Py_UCS4)ordinal);
3039}
3040
3041PyObject *
3042PyUnicode_FromObject(PyObject *obj)
3043{
3044 /* XXX Perhaps we should make this API an alias of
3045 PyObject_Str() instead ?! */
3046 if (PyUnicode_CheckExact(obj)) {
3047 if (PyUnicode_READY(obj) == -1)
3048 return NULL;
3049 Py_INCREF(obj);
3050 return obj;
3051 }
3052 if (PyUnicode_Check(obj)) {
3053 /* For a Unicode subtype that's not a Unicode object,
3054 return a true Unicode object with the same data. */
3055 return _PyUnicode_Copy(obj);
3056 }
3057 PyErr_Format(PyExc_TypeError,
3058 "Can't convert '%.100s' object to str implicitly",
3059 Py_TYPE(obj)->tp_name);
3060 return NULL;
3061}
3062
3063PyObject *
3064PyUnicode_FromEncodedObject(PyObject *obj,
3065 const char *encoding,
3066 const char *errors)
3067{
3068 Py_buffer buffer;
3069 PyObject *v;
3070
3071 if (obj == NULL) {
3072 PyErr_BadInternalCall();
3073 return NULL;
3074 }
3075
3076 /* Decoding bytes objects is the most common case and should be fast */
3077 if (PyBytes_Check(obj)) {
3078 if (PyBytes_GET_SIZE(obj) == 0)
3079 _Py_RETURN_UNICODE_EMPTY();
3080 v = PyUnicode_Decode(
3081 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3082 encoding, errors);
3083 return v;
3084 }
3085
3086 if (PyUnicode_Check(obj)) {
3087 PyErr_SetString(PyExc_TypeError,
3088 "decoding str is not supported");
3089 return NULL;
3090 }
3091
3092 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3093 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3094 PyErr_Format(PyExc_TypeError,
3095 "decoding to str: need a bytes-like object, %.80s found",
3096 Py_TYPE(obj)->tp_name);
3097 return NULL;
3098 }
3099
3100 if (buffer.len == 0) {
3101 PyBuffer_Release(&buffer);
3102 _Py_RETURN_UNICODE_EMPTY();
3103 }
3104
3105 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3106 PyBuffer_Release(&buffer);
3107 return v;
3108}
3109
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters is collapsed into a single '_' (dropped entirely at the
   start and at the end of the name).  The result is written to `lower`,
   a buffer of `lower_len` bytes, and null-terminated.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *out = lower;
    char *out_end = &lower[lower_len - 1];   /* reserved for the '\0' */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; emit it only if more text follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3158
3159PyObject *
3160PyUnicode_Decode(const char *s,
3161 Py_ssize_t size,
3162 const char *encoding,
3163 const char *errors)
3164{
3165 PyObject *buffer = NULL, *unicode;
3166 Py_buffer info;
3167 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3168
3169 if (encoding == NULL) {
3170 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3171 }
3172
3173 /* Shortcuts for common default encodings */
3174 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3175 char *lower = buflower;
3176
3177 /* Fast paths */
3178 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3179 lower += 3;
3180 if (*lower == '_') {
3181 /* Match "utf8" and "utf_8" */
3182 lower++;
3183 }
3184
3185 if (lower[0] == '8' && lower[1] == 0) {
3186 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3187 }
3188 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3189 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3190 }
3191 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3192 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3193 }
3194 }
3195 else {
3196 if (strcmp(lower, "ascii") == 0
3197 || strcmp(lower, "us_ascii") == 0) {
3198 return PyUnicode_DecodeASCII(s, size, errors);
3199 }
3200 #ifdef MS_WINDOWS
3201 else if (strcmp(lower, "mbcs") == 0) {
3202 return PyUnicode_DecodeMBCS(s, size, errors);
3203 }
3204 #endif
3205 else if (strcmp(lower, "latin1") == 0
3206 || strcmp(lower, "latin_1") == 0
3207 || strcmp(lower, "iso_8859_1") == 0
3208 || strcmp(lower, "iso8859_1") == 0) {
3209 return PyUnicode_DecodeLatin1(s, size, errors);
3210 }
3211 }
3212 }
3213
3214 /* Decode via the codec registry */
3215 buffer = NULL;
3216 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3217 goto onError;
3218 buffer = PyMemoryView_FromBuffer(&info);
3219 if (buffer == NULL)
3220 goto onError;
3221 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3222 if (unicode == NULL)
3223 goto onError;
3224 if (!PyUnicode_Check(unicode)) {
3225 PyErr_Format(PyExc_TypeError,
3226 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3227 "use codecs.decode() to decode to arbitrary types",
3228 encoding,
3229 Py_TYPE(unicode)->tp_name);
3230 Py_DECREF(unicode);
3231 goto onError;
3232 }
3233 Py_DECREF(buffer);
3234 return unicode_result(unicode);
3235
3236 onError:
3237 Py_XDECREF(buffer);
3238 return NULL;
3239}
3240
3241PyObject *
3242PyUnicode_AsDecodedObject(PyObject *unicode,
3243 const char *encoding,
3244 const char *errors)
3245{
3246 if (!PyUnicode_Check(unicode)) {
3247 PyErr_BadArgument();
3248 return NULL;
3249 }
3250
3251 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3252 "PyUnicode_AsDecodedObject() is deprecated; "
3253 "use PyCodec_Decode() to decode from str", 1) < 0)
3254 return NULL;
3255
3256 if (encoding == NULL)
3257 encoding = PyUnicode_GetDefaultEncoding();
3258
3259 /* Decode via the codec registry */
3260 return PyCodec_Decode(unicode, encoding, errors);
3261}
3262
3263PyObject *
3264PyUnicode_AsDecodedUnicode(PyObject *unicode,
3265 const char *encoding,
3266 const char *errors)
3267{
3268 PyObject *v;
3269
3270 if (!PyUnicode_Check(unicode)) {
3271 PyErr_BadArgument();
3272 goto onError;
3273 }
3274
3275 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3276 "PyUnicode_AsDecodedUnicode() is deprecated; "
3277 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3278 return NULL;
3279
3280 if (encoding == NULL)
3281 encoding = PyUnicode_GetDefaultEncoding();
3282
3283 /* Decode via the codec registry */
3284 v = PyCodec_Decode(unicode, encoding, errors);
3285 if (v == NULL)
3286 goto onError;
3287 if (!PyUnicode_Check(v)) {
3288 PyErr_Format(PyExc_TypeError,
3289 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3290 "use codecs.decode() to decode to arbitrary types",
3291 encoding,
3292 Py_TYPE(unicode)->tp_name);
3293 Py_DECREF(v);
3294 goto onError;
3295 }
3296 return unicode_result(v);
3297
3298 onError:
3299 return NULL;
3300}
3301
3302PyObject *
3303PyUnicode_Encode(const Py_UNICODE *s,
3304 Py_ssize_t size,
3305 const char *encoding,
3306 const char *errors)
3307{
3308 PyObject *v, *unicode;
3309
3310 unicode = PyUnicode_FromWideChar(s, size);
3311 if (unicode == NULL)
3312 return NULL;
3313 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3314 Py_DECREF(unicode);
3315 return v;
3316}
3317
3318PyObject *
3319PyUnicode_AsEncodedObject(PyObject *unicode,
3320 const char *encoding,
3321 const char *errors)
3322{
3323 PyObject *v;
3324
3325 if (!PyUnicode_Check(unicode)) {
3326 PyErr_BadArgument();
3327 goto onError;
3328 }
3329
3330 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3331 "PyUnicode_AsEncodedObject() is deprecated; "
3332 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3333 "or PyCodec_Encode() for generic encoding", 1) < 0)
3334 return NULL;
3335
3336 if (encoding == NULL)
3337 encoding = PyUnicode_GetDefaultEncoding();
3338
3339 /* Encode via the codec registry */
3340 v = PyCodec_Encode(unicode, encoding, errors);
3341 if (v == NULL)
3342 goto onError;
3343 return v;
3344
3345 onError:
3346 return NULL;
3347}
3348
/* Find the first wide character of `wstr` that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first character for which wcstombs() fails, or 0
   if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) plus a terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return char_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3395
3396static int
3397locale_error_handler(const char *errors, int *surrogateescape)
3398{
3399 _Py_error_handler error_handler = get_error_handler(errors);
3400 switch (error_handler)
3401 {
3402 case _Py_ERROR_STRICT:
3403 *surrogateescape = 0;
3404 return 0;
3405 case _Py_ERROR_SURROGATEESCAPE:
3406 *surrogateescape = 1;
3407 return 0;
3408 default:
3409 PyErr_Format(PyExc_ValueError,
3410 "only 'strict' and 'surrogateescape' error handlers "
3411 "are supported, not '%s'",
3412 errors);
3413 return -1;
3414 }
3415}
3416
/* Encode the str object `unicode` to a bytes object with the C library's
   wide-character conversion.

   `errors` must select the strict or the surrogateescape error handler
   (checked by locale_error_handler()).  In strict mode the string is
   converted with wcstombs(); with surrogateescape it is converted with
   Py_EncodeLocale().  Strings containing an embedded null character are
   rejected with ValueError.

   Returns a new bytes object, or NULL with an exception set.  When a
   character cannot be encoded, a UnicodeEncodeError for the "locale" codec
   is built and passed to PyCodec_StrictErrors(). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    char *errmsg;
    PyObject *bytes, *reason, *exc;
    size_t error_pos, errlen;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null: a mismatch with the reported
       length means the string contains an embedded null character */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is treated as a memory error here;
               any other value as the index of an unencodable character */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* first call: only compute the size of the encoded string */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* second call: encode into the bytes object (len+1 leaves room
           for the terminating null byte) */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            Py_DECREF(bytes);
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

    /* wstr is still allocated on every path that jumps here */
encode_error:
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    /* position not known yet: look for the offending character */
    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);

    /* wstr is reused for the decoded error message */
    wstr = Py_DecodeLocale(errmsg, &errlen);
    if (wstr != NULL) {
        reason = PyUnicode_FromWideChar(wstr, errlen);
        PyMem_RawFree(wstr);
    } else {
        /* the message could not be decoded: use the generic reason below */
        errmsg = NULL;
    }

    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* the reported range covers the single character at error_pos */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        PyCodec_StrictErrors(exc);
        Py_DECREF(exc);
    }
    return NULL;
}
3523
3524PyObject *
3525PyUnicode_EncodeFSDefault(PyObject *unicode)
3526{
3527#if defined(__APPLE__)
3528 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3529#else
3530 PyInterpreterState *interp = PyThreadState_GET()->interp;
3531 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3532 cannot use it to encode and decode filenames before it is loaded. Load
3533 the Python codec requires to encode at least its own filename. Use the C
3534 version of the locale codec until the codec registry is initialized and
3535 the Python codec is loaded.
3536
3537 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3538 cannot only rely on it: check also interp->fscodec_initialized for
3539 subinterpreters. */
3540 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3541 return PyUnicode_AsEncodedString(unicode,
3542 Py_FileSystemDefaultEncoding,
3543 Py_FileSystemDefaultEncodeErrors);
3544 }
3545 else {
3546 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
3547 }
3548#endif
3549}
3550
3551PyObject *
3552PyUnicode_AsEncodedString(PyObject *unicode,
3553 const char *encoding,
3554 const char *errors)
3555{
3556 PyObject *v;
3557 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3558
3559 if (!PyUnicode_Check(unicode)) {
3560 PyErr_BadArgument();
3561 return NULL;
3562 }
3563
3564 if (encoding == NULL) {
3565 return _PyUnicode_AsUTF8String(unicode, errors);
3566 }
3567
3568 /* Shortcuts for common default encodings */
3569 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3570 char *lower = buflower;
3571
3572 /* Fast paths */
3573 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3574 lower += 3;
3575 if (*lower == '_') {
3576 /* Match "utf8" and "utf_8" */
3577 lower++;
3578 }
3579
3580 if (lower[0] == '8' && lower[1] == 0) {
3581 return _PyUnicode_AsUTF8String(unicode, errors);
3582 }
3583 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3584 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3585 }
3586 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3587 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3588 }
3589 }
3590 else {
3591 if (strcmp(lower, "ascii") == 0
3592 || strcmp(lower, "us_ascii") == 0) {
3593 return _PyUnicode_AsASCIIString(unicode, errors);
3594 }
3595#ifdef MS_WINDOWS
3596 else if (strcmp(lower, "mbcs") == 0) {
3597 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3598 }
3599#endif
3600 else if (strcmp(lower, "latin1") == 0 ||
3601 strcmp(lower, "latin_1") == 0 ||
3602 strcmp(lower, "iso_8859_1") == 0 ||
3603 strcmp(lower, "iso8859_1") == 0) {
3604 return _PyUnicode_AsLatin1String(unicode, errors);
3605 }
3606 }
3607 }
3608
3609 /* Encode via the codec registry */
3610 v = _PyCodec_EncodeText(unicode, encoding, errors);
3611 if (v == NULL)
3612 return NULL;
3613
3614 /* The normal path */
3615 if (PyBytes_Check(v))
3616 return v;
3617
3618 /* If the codec returns a buffer, raise a warning and convert to bytes */
3619 if (PyByteArray_Check(v)) {
3620 int error;
3621 PyObject *b;
3622
3623 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3624 "encoder %s returned bytearray instead of bytes; "
3625 "use codecs.encode() to encode to arbitrary types",
3626 encoding);
3627 if (error) {
3628 Py_DECREF(v);
3629 return NULL;
3630 }
3631
3632 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3633 PyByteArray_GET_SIZE(v));
3634 Py_DECREF(v);
3635 return b;
3636 }
3637
3638 PyErr_Format(PyExc_TypeError,
3639 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3640 "use codecs.encode() to encode to arbitrary types",
3641 encoding,
3642 Py_TYPE(v)->tp_name);
3643 Py_DECREF(v);
3644 return NULL;
3645}
3646
3647PyObject *
3648PyUnicode_AsEncodedUnicode(PyObject *unicode,
3649 const char *encoding,
3650 const char *errors)
3651{
3652 PyObject *v;
3653
3654 if (!PyUnicode_Check(unicode)) {
3655 PyErr_BadArgument();
3656 goto onError;
3657 }
3658
3659 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3660 "PyUnicode_AsEncodedUnicode() is deprecated; "
3661 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3662 return NULL;
3663
3664 if (encoding == NULL)
3665 encoding = PyUnicode_GetDefaultEncoding();
3666
3667 /* Encode via the codec registry */
3668 v = PyCodec_Encode(unicode, encoding, errors);
3669 if (v == NULL)
3670 goto onError;
3671 if (!PyUnicode_Check(v)) {
3672 PyErr_Format(PyExc_TypeError,
3673 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3674 "use codecs.encode() to encode to arbitrary types",
3675 encoding,
3676 Py_TYPE(v)->tp_name);
3677 Py_DECREF(v);
3678 goto onError;
3679 }
3680 return v;
3681
3682 onError:
3683 return NULL;
3684}
3685
/* Return the byte offset in str (of length len) of the first multibyte
   sequence that mbrtowc() reports as invalid or incomplete.

   Returns 0 when no such sequence is found, and always when
   HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t nbytes = mbrtowc(&decoded, cursor, remaining, &state);

        if (nbytes == 0) {
            /* Reached end of string */
            break;
        }
        if (nbytes == (size_t)-1 || nbytes == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cursor - str);
        }
        cursor += nbytes;
        remaining -= nbytes;
    }
    /* fall through: failed to find the undecodable byte sequence */
#endif
    return 0;
}
3716
3717PyObject*
3718PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3719 const char *errors)
3720{
3721 wchar_t smallbuf[256];
3722 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3723 wchar_t *wstr;
3724 size_t wlen, wlen2;
3725 PyObject *unicode;
3726 int surrogateescape;
3727 size_t error_pos, errlen;
3728 char *errmsg;
3729 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
3730
3731 if (locale_error_handler(errors, &surrogateescape) < 0)
3732 return NULL;
3733
3734 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3735 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3736 return NULL;
3737 }
3738
3739 if (surrogateescape) {
3740 /* "surrogateescape" error handler */
3741 wstr = Py_DecodeLocale(str, &wlen);
3742 if (wstr == NULL) {
3743 if (wlen == (size_t)-1)
3744 PyErr_NoMemory();
3745 else
3746 PyErr_SetFromErrno(PyExc_OSError);
3747 return NULL;
3748 }
3749
3750 unicode = PyUnicode_FromWideChar(wstr, wlen);
3751 PyMem_RawFree(wstr);
3752 }
3753 else {
3754 /* strict mode */
3755#ifndef HAVE_BROKEN_MBSTOWCS
3756 wlen = mbstowcs(NULL, str, 0);
3757#else
3758 wlen = len;
3759#endif
3760 if (wlen == (size_t)-1)
3761 goto decode_error;
3762 if (wlen+1 <= smallbuf_len) {
3763 wstr = smallbuf;
3764 }
3765 else {
3766 wstr = PyMem_New(wchar_t, wlen+1);
3767 if (!wstr)
3768 return PyErr_NoMemory();
3769 }
3770
3771 wlen2 = mbstowcs(wstr, str, wlen+1);
3772 if (wlen2 == (size_t)-1) {
3773 if (wstr != smallbuf)
3774 PyMem_Free(wstr);
3775 goto decode_error;
3776 }
3777#ifdef HAVE_BROKEN_MBSTOWCS
3778 assert(wlen2 == wlen);
3779#endif
3780 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3781 if (wstr != smallbuf)
3782 PyMem_Free(wstr);
3783 }
3784 return unicode;
3785
3786decode_error:
3787 errmsg = strerror(errno);
3788 assert(errmsg != NULL);
3789
3790 error_pos = mbstowcs_errorpos(str, len);
3791 wstr = Py_DecodeLocale(errmsg, &errlen);
3792 if (wstr != NULL) {
3793 reason = PyUnicode_FromWideChar(wstr, errlen);
3794 PyMem_RawFree(wstr);
3795 }
3796
3797 if (reason == NULL)
3798 reason = PyUnicode_FromString(
3799 "mbstowcs() encountered an invalid multibyte sequence");
3800 if (reason == NULL)
3801 return NULL;
3802
3803 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3804 "locale", str, len,
3805 (Py_ssize_t)error_pos,
3806 (Py_ssize_t)(error_pos+1),
3807 reason);
3808 Py_DECREF(reason);
3809 if (exc != NULL) {
3810 PyCodec_StrictErrors(exc);
3811 Py_DECREF(exc);
3812 }
3813 return NULL;
3814}
3815
3816PyObject*
3817PyUnicode_DecodeLocale(const char *str, const char *errors)
3818{
3819 Py_ssize_t size = (Py_ssize_t)strlen(str);
3820 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3821}
3822
3823
3824PyObject*
3825PyUnicode_DecodeFSDefault(const char *s) {
3826 Py_ssize_t size = (Py_ssize_t)strlen(s);
3827 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3828}
3829
3830PyObject*
3831PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3832{
3833#if defined(__APPLE__)
3834 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3835#else
3836 PyInterpreterState *interp = PyThreadState_GET()->interp;
3837 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3838 cannot use it to encode and decode filenames before it is loaded. Load
3839 the Python codec requires to encode at least its own filename. Use the C
3840 version of the locale codec until the codec registry is initialized and
3841 the Python codec is loaded.
3842
3843 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3844 cannot only rely on it: check also interp->fscodec_initialized for
3845 subinterpreters. */
3846 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3847 return PyUnicode_Decode(s, size,
3848 Py_FileSystemDefaultEncoding,
3849 Py_FileSystemDefaultEncodeErrors);
3850 }
3851 else {
3852 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
3853 }
3854#endif
3855}
3856
3857
3858int
3859PyUnicode_FSConverter(PyObject* arg, void* addr)
3860{
3861 PyObject *path = NULL;
3862 PyObject *output = NULL;
3863 Py_ssize_t size;
3864 void *data;
3865 if (arg == NULL) {
3866 Py_DECREF(*(PyObject**)addr);
3867 *(PyObject**)addr = NULL;
3868 return 1;
3869 }
3870 path = PyOS_FSPath(arg);
3871 if (path == NULL) {
3872 return 0;
3873 }
3874 if (PyBytes_Check(path)) {
3875 output = path;
3876 }
3877 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3878 output = PyUnicode_EncodeFSDefault(path);
3879 Py_DECREF(path);
3880 if (!output) {
3881 return 0;
3882 }
3883 assert(PyBytes_Check(output));
3884 }
3885
3886 size = PyBytes_GET_SIZE(output);
3887 data = PyBytes_AS_STRING(output);
3888 if ((size_t)size != strlen(data)) {
3889 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3890 Py_DECREF(output);
3891 return 0;
3892 }
3893 *(PyObject**)addr = output;
3894 return Py_CLEANUP_SUPPORTED;
3895}
3896
3897
3898int
3899PyUnicode_FSDecoder(PyObject* arg, void* addr)
3900{
3901 int is_buffer = 0;
3902 PyObject *path = NULL;
3903 PyObject *output = NULL;
3904 if (arg == NULL) {
3905 Py_DECREF(*(PyObject**)addr);
3906 *(PyObject**)addr = NULL;
3907 return 1;
3908 }
3909
3910 is_buffer = PyObject_CheckBuffer(arg);
3911 if (!is_buffer) {
3912 path = PyOS_FSPath(arg);
3913 if (path == NULL) {
3914 return 0;
3915 }
3916 }
3917 else {
3918 path = arg;
3919 Py_INCREF(arg);
3920 }
3921
3922 if (PyUnicode_Check(path)) {
3923 if (PyUnicode_READY(path) == -1) {
3924 Py_DECREF(path);
3925 return 0;
3926 }
3927 output = path;
3928 }
3929 else if (PyBytes_Check(path) || is_buffer) {
3930 PyObject *path_bytes = NULL;
3931
3932 if (!PyBytes_Check(path) &&
3933 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3934 "path should be string, bytes, or os.PathLike, not %.200s",
3935 Py_TYPE(arg)->tp_name)) {
3936 Py_DECREF(path);
3937 return 0;
3938 }
3939 path_bytes = PyBytes_FromObject(path);
3940 Py_DECREF(path);
3941 if (!path_bytes) {
3942 return 0;
3943 }
3944 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3945 PyBytes_GET_SIZE(path_bytes));
3946 Py_DECREF(path_bytes);
3947 if (!output) {
3948 return 0;
3949 }
3950 }
3951 else {
3952 PyErr_Format(PyExc_TypeError,
3953 "path should be string, bytes, or os.PathLike, not %.200s",
3954 Py_TYPE(arg)->tp_name);
3955 Py_DECREF(path);
3956 return 0;
3957 }
3958 if (PyUnicode_READY(output) == -1) {
3959 Py_DECREF(output);
3960 return 0;
3961 }
3962 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3963 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3964 PyErr_SetString(PyExc_ValueError, "embedded null character");
3965 Py_DECREF(output);
3966 return 0;
3967 }
3968 *(PyObject**)addr = output;
3969 return Py_CLEANUP_SUPPORTED;
3970}
3971
3972
3973const char *
3974PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3975{
3976 PyObject *bytes;
3977
3978 if (!PyUnicode_Check(unicode)) {
3979 PyErr_BadArgument();
3980 return NULL;
3981 }
3982 if (PyUnicode_READY(unicode) == -1)
3983 return NULL;
3984
3985 if (PyUnicode_UTF8(unicode) == NULL) {
3986 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3987 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3988 if (bytes == NULL)
3989 return NULL;
3990 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3991 if (_PyUnicode_UTF8(unicode) == NULL) {
3992 PyErr_NoMemory();
3993 Py_DECREF(bytes);
3994 return NULL;
3995 }
3996 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3997 memcpy(_PyUnicode_UTF8(unicode),
3998 PyBytes_AS_STRING(bytes),
3999 _PyUnicode_UTF8_LENGTH(unicode) + 1);
4000 Py_DECREF(bytes);
4001 }
4002
4003 if (psize)
4004 *psize = PyUnicode_UTF8_LENGTH(unicode);
4005 return PyUnicode_UTF8(unicode);
4006}
4007
4008const char *
4009PyUnicode_AsUTF8(PyObject *unicode)
4010{
4011 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4012}
4013
/* Return the wchar_t (wstr) representation stored on the str object,
   building and storing it first when it is missing.

   If size is not NULL, PyUnicode_WSTR_LENGTH(unicode) is stored into
   *size.  Returns NULL with an exception set if unicode is not a str
   object or if the buffer cannot be allocated.

   How the buffer is built depends on the string kind and on
   SIZEOF_WCHAR_T:
     - 4-byte kind, 2-byte wchar_t: code points above 0xFFFF are written
       as surrogate pairs;
     - 1-byte kind, or 2-byte kind with 4-byte wchar_t: each character is
       widened and copied;
     - the remaining kind/wchar_t combinations are treated as impossible
       states (Py_FatalError). */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        /* NOTE(review): the 1-byte branch below also tests
           PyUnicode_IS_COMPACT_ASCII(), so compact ASCII objects appear to
           reach this point too -- confirm. */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            /* first pass: count the code points needing a surrogate pair,
               i.e. one extra wchar_t each */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* length + surrogates + 1 for the terminating null */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* second pass: restart from the beginning of the data and
               write the wchar_t units */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* debug-only sanity check against the size computed in
                   the first pass */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* null-terminate the wstr */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* guard the size computation passed to PyObject_MALLOC()
               below against overflow */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* presumably compact ASCII objects have no separate wstr
               length field to fill in -- TODO confirm against the object
               layout */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* widen each byte to a wchar_t */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* widen each 2-byte unit to a wchar_t */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* release the buffer allocated above before aborting */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
4130
4131Py_UNICODE *
4132PyUnicode_AsUnicode(PyObject *unicode)
4133{
4134 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4135}
4136
4137const Py_UNICODE *
4138_PyUnicode_AsUnicode(PyObject *unicode)
4139{
4140 Py_ssize_t size;
4141 const Py_UNICODE *wstr;
4142
4143 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4144 if (wstr && wcslen(wstr) != (size_t)size) {
4145 PyErr_SetString(PyExc_ValueError, "embedded null character");
4146 return NULL;
4147 }
4148 return wstr;
4149}
4150
4151
4152Py_ssize_t
4153PyUnicode_GetSize(PyObject *unicode)
4154{
4155 if (!PyUnicode_Check(unicode)) {
4156 PyErr_BadArgument();
4157 goto onError;
4158 }
4159 if (_PyUnicode_WSTR(unicode) == NULL) {
4160 if (PyUnicode_AsUnicode(unicode) == NULL)
4161 goto onError;
4162 }
4163 return PyUnicode_WSTR_LENGTH(unicode);
4164
4165 onError:
4166 return -1;
4167}
4168
4169Py_ssize_t
4170PyUnicode_GetLength(PyObject *unicode)
4171{
4172 if (!PyUnicode_Check(unicode)) {
4173 PyErr_BadArgument();
4174 return -1;
4175 }
4176 if (PyUnicode_READY(unicode) == -1)
4177 return -1;
4178 return PyUnicode_GET_LENGTH(unicode);
4179}
4180
4181Py_UCS4
4182PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4183{
4184 void *data;
4185 int kind;
4186
4187 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4188 PyErr_BadArgument();
4189 return (Py_UCS4)-1;
4190 }
4191 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4192 PyErr_SetString(PyExc_IndexError, "string index out of range");
4193 return (Py_UCS4)-1;
4194 }
4195 data = PyUnicode_DATA(unicode);
4196 kind = PyUnicode_KIND(unicode);
4197 return PyUnicode_READ(kind, data, index);
4198}
4199
4200int
4201PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4202{
4203 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4204 PyErr_BadArgument();
4205 return -1;
4206 }
4207 assert(PyUnicode_IS_READY(unicode));
4208 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4209 PyErr_SetString(PyExc_IndexError, "string index out of range");
4210 return -1;
4211 }
4212 if (unicode_check_modifiable(unicode))
4213 return -1;
4214 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4215 PyErr_SetString(PyExc_ValueError, "character out of range");
4216 return -1;
4217 }
4218 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4219 index, ch);
4220 return 0;
4221}
4222
/* Return the name of the default encoding, always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4228
4229/* create or adjust a UnicodeDecodeError */
4230static void
4231make_decode_exception(PyObject **exceptionObject,
4232 const char *encoding,
4233 const char *input, Py_ssize_t length,
4234 Py_ssize_t startpos, Py_ssize_t endpos,
4235 const char *reason)
4236{
4237 if (*exceptionObject == NULL) {
4238 *exceptionObject = PyUnicodeDecodeError_Create(
4239 encoding, input, length, startpos, endpos, reason);
4240 }
4241 else {
4242 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4243 goto onError;
4244 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4245 goto onError;
4246 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4247 goto onError;
4248 }
4249 return;
4250
4251onError:
4252 Py_CLEAR(*exceptionObject);
4253}
4254
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   The output is a str object in wchar_t (wstr) representation; it is
   resized when the replacement plus the rest of the input does not fit.
   *input and *inend are reloaded from the exception object because the
   callback may have replaced it; *endinpos and *inptr are moved to the
   position returned by the callback (a negative position counts from
   the end of the input).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    /* look up the error handler once and cache it for later calls */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" prefix and leaves only the message */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is enough and cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  memcpy() is used rather than
       wcsncpy() because wcsncpy() stops at the first null character and
       zero-fills the remainder, which would silently drop whatever follows
       an embedded null in the replacement string. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
4367
4368static int
4369unicode_decode_call_errorhandler_writer(
4370 const char *errors, PyObject **errorHandler,
4371 const char *encoding, const char *reason,
4372 const char **input, const char **inend, Py_ssize_t *startinpos,
4373 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4374 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4375{
4376 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4377
4378 PyObject *restuple = NULL;
4379 PyObject *repunicode = NULL;
4380 Py_ssize_t insize;
4381 Py_ssize_t newpos;
4382 Py_ssize_t replen;
4383 PyObject *inputobj = NULL;
4384
4385 if (*errorHandler == NULL) {
4386 *errorHandler = PyCodec_LookupError(errors);
4387 if (*errorHandler == NULL)
4388 goto onError;
4389 }
4390
4391 make_decode_exception(exceptionObject,
4392 encoding,
4393 *input, *inend - *input,
4394 *startinpos, *endinpos,
4395 reason);
4396 if (*exceptionObject == NULL)
4397 goto onError;
4398
4399 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4400 if (restuple == NULL)
4401 goto onError;
4402 if (!PyTuple_Check(restuple)) {
4403 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4404 goto onError;
4405 }
4406 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4407 goto onError;
4408
4409 /* Copy back the bytes variables, which might have been modified by the
4410 callback */
4411 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4412 if (!inputobj)
4413 goto onError;
4414 *input = PyBytes_AS_STRING(inputobj);
4415 insize = PyBytes_GET_SIZE(inputobj);
4416 *inend = *input + insize;
4417 /* we can DECREF safely, as the exception has another reference,
4418 so the object won't go away. */
4419 Py_DECREF(inputobj);
4420
4421 if (newpos<0)
4422 newpos = insize+newpos;
4423 if (newpos<0 || newpos>insize) {
4424 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4425 goto onError;
4426 }
4427
4428 replen = PyUnicode_GET_LENGTH(repunicode);
4429 if (replen > 1) {
4430 writer->min_length += replen - 1;
4431 writer->overallocate = 1;
4432 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4433 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4434 goto onError;
4435 }
4436 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4437 goto onError;
4438
4439 *endinpos = newpos;
4440 *inptr = *input + newpos;
4441
4442 /* we made it! */
4443 Py_DECREF(restuple);
4444 return 0;
4445
4446 onError:
4447 Py_XDECREF(restuple);
4448 return -1;
4449}
4450
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that they evaluate their
   argument several times, so the argument must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup indexed by the ASCII code (0..127),
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is only indexed with
 * values in 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4531
4532PyObject *
4533PyUnicode_DecodeUTF7(const char *s,
4534 Py_ssize_t size,
4535 const char *errors)
4536{
4537 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4538}
4539
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 *   s, size  - input bytes and their count.
 *   errors   - error handling scheme, forwarded to the error-handler
 *              helper.
 *   consumed - if NULL, the input is final and an inconsistent open shift
 *              sequence at the end is reported through the error handler.
 *              If non-NULL, it receives the number of input bytes used;
 *              when the input ends inside a shift sequence that is the
 *              offset of the opening '+', and the output produced since
 *              that '+' is dropped.
 *
 * Returns a new str object, or NULL after jumping to onError. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* startinpos is assigned when a '+' or an unexpected special character
       is seen; for a shift sequence it stays at the offset of the '+'. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* writer position at the start of the current shift sequence */
    Py_ssize_t shiftOutStart;
    /* number of not yet consumed bits held in base64buffer */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* pending high surrogate decoded from the shift sequence, or 0 */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      /* Re-entry point used after the end-of-input error handler below
         leaves input still to be processed (s < e). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a low surrogate: emit the pending high
                               surrogate alone, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating character is one that decodes directly;
                   otherwise it is discarded. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report input[startinpos:endinpos] to the error handler.  The
           helper receives &s and &writer; a non-zero result aborts the
           decode. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler left unread input: resume decoding */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Input ended inside a shift sequence: report everything up to
               the '+' as consumed and drop the output produced since. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result directly from the first shiftOutStart
                   characters of the writer buffer instead of finishing
                   the writer.
                   NOTE(review): presumably so that non-ASCII characters
                   written inside the dropped shift sequence do not
                   influence the result's representation -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* common failure exit: release helper objects and the writer */
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4738
4739
4740PyObject *
4741_PyUnicode_EncodeUTF7(PyObject *str,
4742 int base64SetO,
4743 int base64WhiteSpace,
4744 const char *errors)
4745{
4746 int kind;
4747 void *data;
4748 Py_ssize_t len;
4749 PyObject *v;
4750 int inShift = 0;
4751 Py_ssize_t i;
4752 unsigned int base64bits = 0;
4753 unsigned long base64buffer = 0;
4754 char * out;
4755 char * start;
4756
4757 if (PyUnicode_READY(str) == -1)
4758 return NULL;
4759 kind = PyUnicode_KIND(str);
4760 data = PyUnicode_DATA(str);
4761 len = PyUnicode_GET_LENGTH(str);
4762
4763 if (len == 0)
4764 return PyBytes_FromStringAndSize(NULL, 0);
4765
4766 /* It might be possible to tighten this worst case */
4767 if (len > PY_SSIZE_T_MAX / 8)
4768 return PyErr_NoMemory();
4769 v = PyBytes_FromStringAndSize(NULL, len * 8);
4770 if (v == NULL)
4771 return NULL;
4772
4773 start = out = PyBytes_AS_STRING(v);
4774 for (i = 0; i < len; ++i) {
4775 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4776
4777 if (inShift) {
4778 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4779 /* shifting out */
4780 if (base64bits) { /* output remaining bits */
4781 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4782 base64buffer = 0;
4783 base64bits = 0;
4784 }
4785 inShift = 0;
4786 /* Characters not in the BASE64 set implicitly unshift the sequence
4787 so no '-' is required, except if the character is itself a '-' */
4788 if (IS_BASE64(ch) || ch == '-') {
4789 *out++ = '-';
4790 }
4791 *out++ = (char) ch;
4792 }
4793 else {
4794 goto encode_char;
4795 }
4796 }
4797 else { /* not in a shift sequence */
4798 if (ch == '+') {
4799 *out++ = '+';
4800 *out++ = '-';
4801 }
4802 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4803 *out++ = (char) ch;
4804 }
4805 else {
4806 *out++ = '+';
4807 inShift = 1;
4808 goto encode_char;
4809 }
4810 }
4811 continue;
4812encode_char:
4813 if (ch >= 0x10000) {
4814 assert(ch <= MAX_UNICODE);
4815
4816 /* code first surrogate */
4817 base64bits += 16;
4818 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4819 while (base64bits >= 6) {
4820 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4821 base64bits -= 6;
4822 }
4823 /* prepare second surrogate */
4824 ch = Py_UNICODE_LOW_SURROGATE(ch);
4825 }
4826 base64bits += 16;
4827 base64buffer = (base64buffer << 16) | ch;
4828 while (base64bits >= 6) {
4829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4830 base64bits -= 6;
4831 }
4832 }
4833 if (base64bits)
4834 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4835 if (inShift)
4836 *out++ = '-';
4837 if (_PyBytes_Resize(&v, out - start) < 0)
4838 return NULL;
4839 return v;
4840}
4841PyObject *
4842PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4843 Py_ssize_t size,
4844 int base64SetO,
4845 int base64WhiteSpace,
4846 const char *errors)
4847{
4848 PyObject *result;
4849 PyObject *tmp = PyUnicode_FromWideChar(s, size);
4850 if (tmp == NULL)
4851 return NULL;
4852 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4853 base64WhiteSpace, errors);
4854 Py_DECREF(tmp);
4855 return result;
4856}
4857
4858#undef IS_BASE64
4859#undef FROM_BASE64
4860#undef TO_BASE64
4861#undef DECODE_DIRECT
4862#undef ENCODE_DIRECT
4863
4864/* --- UTF-8 Codec -------------------------------------------------------- */
4865
/* Decode `size` bytes of UTF-8 data starting at `s`, using the error
   handling scheme named by `errors`.

   Thin wrapper around PyUnicode_DecodeUTF8Stateful() with `consumed` set
   to NULL, i.e. the input is treated as complete.  Returns whatever the
   stateful decoder returns. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
4873
4874#include "stringlib/asciilib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
4878#include "stringlib/ucs1lib.h"
4879#include "stringlib/codecs.h"
4880#include "stringlib/undef.h"
4881
4882#include "stringlib/ucs2lib.h"
4883#include "stringlib/codecs.h"
4884#include "stringlib/undef.h"
4885
4886#include "stringlib/ucs4lib.h"
4887#include "stringlib/codecs.h"
4888#include "stringlib/undef.h"
4889
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of every
   byte set, so `word & ASCII_CHAR_MASK` is non-zero exactly when at least
   one byte of `word` is >= 0x80.  Only 4- and 8-byte longs are supported;
   any other size is rejected at compile time. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4899
4900static Py_ssize_t
4901ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4902{
4903 const char *p = start;
4904 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4905
4906 /*
4907 * Issue #17237: m68k is a bit different from most architectures in
4908 * that objects do not use "natural alignment" - for example, int and
4909 * long are only aligned at 2-byte boundaries. Therefore the assert()
4910 * won't work; also, tests have shown that skipping the "optimised
4911 * version" will even speed up m68k.
4912 */
4913#if !defined(__m68k__)
4914#if SIZEOF_LONG <= SIZEOF_VOID_P
4915 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4916 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4917 /* Fast path, see in STRINGLIB(utf8_decode) for
4918 an explanation. */
4919 /* Help allocation */
4920 const char *_p = p;
4921 Py_UCS1 * q = dest;
4922 while (_p < aligned_end) {
4923 unsigned long value = *(const unsigned long *) _p;
4924 if (value & ASCII_CHAR_MASK)
4925 break;
4926 *((unsigned long *)q) = value;
4927 _p += SIZEOF_LONG;
4928 q += SIZEOF_LONG;
4929 }
4930 p = _p;
4931 while (p < end) {
4932 if ((unsigned char)*p & 0x80)
4933 break;
4934 *q++ = *p++;
4935 }
4936 return p - start;
4937 }
4938#endif
4939#endif
4940 while (p < end) {
4941 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4942 for an explanation. */
4943 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4944 /* Help allocation */
4945 const char *_p = p;
4946 while (_p < aligned_end) {
4947 unsigned long value = *(unsigned long *) _p;
4948 if (value & ASCII_CHAR_MASK)
4949 break;
4950 _p += SIZEOF_LONG;
4951 }
4952 p = _p;
4953 if (_p == end)
4954 break;
4955 }
4956 if ((unsigned char)*p & 0x80)
4957 break;
4958 ++p;
4959 }
4960 memcpy(dest, start, p - start);
4961 return p - start;
4962}
4963
/* Decode `size` bytes of UTF-8 data starting at `s`.

     errors   - error handling scheme; "ignore", "replace" and
                "surrogateescape" are handled inline, anything else goes
                through unicode_decode_call_errorhandler_writer().
     consumed - if NULL the input is final and an incomplete sequence at
                the end is an error; if non-NULL it receives the number of
                bytes decoded and decoding simply stops before an
                incomplete trailing sequence.

   Returns a new str object, or NULL after jumping to onError. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    /* resolved lazily from `errors` the first time an error is hit */
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* Copy the leading ASCII run straight into the writer buffer. */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Run the stringlib decoder specialised for the writer's current
           buffer kind; it advances s and writer.pos itself.  As handled by
           the switch below, results 0..4 are status codes and any other
           value is a character that this function still has to write. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        switch (ch) {
        case 0:
            /* all input used, or (stateful mode) stop before the
               incomplete trailing sequence */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            /* the error span covers ch - 1 bytes */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the offending bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* one U+FFFD for the whole offending span */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each offending byte b becomes the code point 0xdc00 + b,
               which needs at least a 2-byte buffer. */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* generic path: the helper receives &s and &writer; a
               non-zero result aborts the decode */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    /* common failure exit: release helper objects and the writer */
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
5097
5098#if defined(__APPLE__) || defined(__ANDROID__)
5099
5100/* Simplified UTF-8 decoder using surrogateescape error handler,
5101 used to decode the command line arguments on Mac OS X and Android.
5102
5103 Return a pointer to a newly allocated wide character string (use
5104 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
5105
5106wchar_t*
5107_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5108{
5109 const char *e;
5110 wchar_t *unicode;
5111 Py_ssize_t outpos;
5112
5113 /* Note: size will always be longer than the resulting Unicode
5114 character count */
5115 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
5116 return NULL;
5117 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5118 if (!unicode)
5119 return NULL;
5120
5121 /* Unpack UTF-8 encoded data */
5122 e = s + size;
5123 outpos = 0;
5124 while (s < e) {
5125 Py_UCS4 ch;
5126#if SIZEOF_WCHAR_T == 4
5127 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5128#else
5129 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5130#endif
5131 if (ch > 0xFF) {
5132#if SIZEOF_WCHAR_T == 4
5133 assert(0);
5134#else
5135 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5136 /* compute and append the two surrogates: */
5137 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5138 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5139#endif
5140 }
5141 else {
5142 if (!ch && s == e)
5143 break;
5144 /* surrogateescape */
5145 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5146 }
5147 }
5148 unicode[outpos] = L'\0';
5149 return unicode;
5150}
5151
5152#endif /* __APPLE__ or __ANDROID__ */
5153
5154/* Primary internal function which creates utf8 encoded bytes objects.
5155
5156 Allocation strategy: if the string is short, convert into a stack buffer
5157 and allocate exactly as much space needed at the end. Else allocate the
5158 maximum possible needed (4 result bytes per Unicode character), and return
5159 the excess memory at the end.
5160*/
5161PyObject *
5162_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5163{
5164 enum PyUnicode_Kind kind;
5165 void *data;
5166 Py_ssize_t size;
5167
5168 if (!PyUnicode_Check(unicode)) {
5169 PyErr_BadArgument();
5170 return NULL;
5171 }
5172
5173 if (PyUnicode_READY(unicode) == -1)
5174 return NULL;
5175
5176 if (PyUnicode_UTF8(unicode))
5177 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5178 PyUnicode_UTF8_LENGTH(unicode));
5179
5180 kind = PyUnicode_KIND(unicode);
5181 data = PyUnicode_DATA(unicode);
5182 size = PyUnicode_GET_LENGTH(unicode);
5183
5184 switch (kind) {
5185 default:
5186 assert(0);
5187 case PyUnicode_1BYTE_KIND:
5188 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5189 assert(!PyUnicode_IS_ASCII(unicode));
5190 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5191 case PyUnicode_2BYTE_KIND:
5192 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5193 case PyUnicode_4BYTE_KIND:
5194 return ucs4lib_utf8_encoder(unicode, data, size, errors);
5195 }
5196}
5197
5198PyObject *
5199PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5200 Py_ssize_t size,
5201 const char *errors)
5202{
5203 PyObject *v, *unicode;
5204
5205 unicode = PyUnicode_FromWideChar(s, size);
5206 if (unicode == NULL)
5207 return NULL;
5208 v = _PyUnicode_AsUTF8String(unicode, errors);
5209 Py_DECREF(unicode);
5210 return v;
5211}
5212
/* Encode `unicode` as UTF-8.  Thin wrapper around
   _PyUnicode_AsUTF8String() with `errors` set to NULL; returns whatever
   that function returns. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    return _PyUnicode_AsUTF8String(unicode, NULL);
}
5218
5219/* --- UTF-32 Codec ------------------------------------------------------- */
5220
/* Decode `size` bytes of UTF-32 data starting at `s`.

   Thin wrapper around PyUnicode_DecodeUTF32Stateful() with `consumed` set
   to NULL, i.e. the input is treated as complete.  `byteorder` is passed
   through unchanged.  Returns whatever the stateful decoder returns. */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
5229
/* Decode `size` bytes of UTF-32 data starting at `s`.

     errors    - error handling scheme, forwarded to the error-handler
                 helper.
     byteorder - optional in/out byte order.  A value of 0 (or a NULL
                 pointer) means "detect": a leading BOM is consumed and
                 selects the order (-1 little endian, 1 big endian), and
                 the result is stored back through the pointer.  Without
                 a BOM the platform's native order is used.
     consumed  - if NULL, trailing bytes that do not form a full 4-byte
                 unit are an error ("truncated data"); if non-NULL,
                 decoding stops before them and the number of bytes used
                 is stored here.

   Returns a new str object, or NULL after jumping to onError. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* read the first unit as little endian */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    if (q == e) {
        /* nothing left after the optional BOM */
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means "no BOM seen": use the native order */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast loop: write code points directly into the writer
               buffer for as long as they fit its current kind (<= maxch)
               and are not surrogates.  On a break, ch holds the value
               that stopped the loop and q still points at it. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* the fast loop ran out of complete 4-byte units */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* valid code point that does not fit the current buffer kind:
               write it through the writer, then return to the fast loop */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* common failure exit: release the writer and helper objects */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5374
5375PyObject *
5376_PyUnicode_EncodeUTF32(PyObject *str,
5377 const char *errors,
5378 int byteorder)
5379{
5380 enum PyUnicode_Kind kind;
5381 const void *data;
5382 Py_ssize_t len;
5383 PyObject *v;
5384 uint32_t *out;
5385#if PY_LITTLE_ENDIAN
5386 int native_ordering = byteorder <= 0;
5387#else
5388 int native_ordering = byteorder >= 0;
5389#endif
5390 const char *encoding;
5391 Py_ssize_t nsize, pos;
5392 PyObject *errorHandler = NULL;
5393 PyObject *exc = NULL;
5394 PyObject *rep = NULL;
5395
5396 if (!PyUnicode_Check(str)) {
5397 PyErr_BadArgument();
5398 return NULL;
5399 }
5400 if (PyUnicode_READY(str) == -1)
5401 return NULL;
5402 kind = PyUnicode_KIND(str);
5403 data = PyUnicode_DATA(str);
5404 len = PyUnicode_GET_LENGTH(str);
5405
5406 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5407 return PyErr_NoMemory();
5408 nsize = len + (byteorder == 0);
5409 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5410 if (v == NULL)
5411 return NULL;
5412
5413 /* output buffer is 4-bytes aligned */
5414 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5415 out = (uint32_t *)PyBytes_AS_STRING(v);
5416 if (byteorder == 0)
5417 *out++ = 0xFEFF;
5418 if (len == 0)
5419 goto done;
5420
5421 if (byteorder == -1)
5422 encoding = "utf-32-le";
5423 else if (byteorder == 1)
5424 encoding = "utf-32-be";
5425 else
5426 encoding = "utf-32";
5427
5428 if (kind == PyUnicode_1BYTE_KIND) {
5429 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5430 goto done;
5431 }
5432
5433 pos = 0;
5434 while (pos < len) {
5435 Py_ssize_t repsize, moreunits;
5436
5437 if (kind == PyUnicode_2BYTE_KIND) {
5438 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5439 &out, native_ordering);
5440 }
5441 else {
5442 assert(kind == PyUnicode_4BYTE_KIND);
5443 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5444 &out, native_ordering);
5445 }
5446 if (pos == len)
5447 break;
5448
5449 rep = unicode_encode_call_errorhandler(
5450 errors, &errorHandler,
5451 encoding, "surrogates not allowed",
5452 str, &exc, pos, pos + 1, &pos);
5453 if (!rep)
5454 goto error;
5455
5456 if (PyBytes_Check(rep)) {
5457 repsize = PyBytes_GET_SIZE(rep);
5458 if (repsize & 3) {
5459 raise_encode_exception(&exc, encoding,
5460 str, pos - 1, pos,
5461 "surrogates not allowed");
5462 goto error;
5463 }
5464 moreunits = repsize / 4;
5465 }
5466 else {
5467 assert(PyUnicode_Check(rep));
5468 if (PyUnicode_READY(rep) < 0)
5469 goto error;
5470 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5471 if (!PyUnicode_IS_ASCII(rep)) {
5472 raise_encode_exception(&exc, encoding,
5473 str, pos - 1, pos,
5474 "surrogates not allowed");
5475 goto error;
5476 }
5477 }
5478
5479 /* four bytes are reserved for each surrogate */
5480 if (moreunits > 1) {
5481 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5482 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5483 /* integer overflow */
5484 PyErr_NoMemory();
5485 goto error;
5486 }
5487 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5488 goto error;
5489 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5490 }
5491
5492 if (PyBytes_Check(rep)) {
5493 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5494 out += moreunits;
5495 } else /* rep is unicode */ {
5496 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5497 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5498 &out, native_ordering);
5499 }
5500
5501 Py_CLEAR(rep);
5502 }
5503
5504 /* Cut back to size actually needed. This is necessary for, for example,
5505 encoding of a string containing isolated surrogates and the 'ignore'
5506 handler is used. */
5507 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5508 if (nsize != PyBytes_GET_SIZE(v))
5509 _PyBytes_Resize(&v, nsize);
5510 Py_XDECREF(errorHandler);
5511 Py_XDECREF(exc);
5512 done:
5513 return v;
5514 error:
5515 Py_XDECREF(rep);
5516 Py_XDECREF(errorHandler);
5517 Py_XDECREF(exc);
5518 Py_XDECREF(v);
5519 return NULL;
5520}
5521
5522PyObject *
5523PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5524 Py_ssize_t size,
5525 const char *errors,
5526 int byteorder)
5527{
5528 PyObject *result;
5529 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5530 if (tmp == NULL)
5531 return NULL;
5532 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5533 Py_DECREF(tmp);
5534 return result;
5535}
5536
/* Encode `unicode` as UTF-32.  Thin wrapper around
   _PyUnicode_EncodeUTF32() with `errors` set to NULL and `byteorder` set
   to 0; returns whatever that function returns. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
}
5542
5543/* --- UTF-16 Codec ------------------------------------------------------- */
5544
/* Decode `size` bytes of UTF-16 data starting at `s`.

   Thin wrapper around PyUnicode_DecodeUTF16Stateful() with `consumed` set
   to NULL, i.e. the input is treated as complete.  `byteorder` is passed
   through unchanged.  Returns whatever the stateful decoder returns. */
PyObject *
PyUnicode_DecodeUTF16(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
}
5553
/* Decode `size` bytes of UTF-16 data starting at `s`.

     errors    - error handling scheme, forwarded to the error-handler
                 helper.
     byteorder - optional in/out byte order.  A value of 0 (or a NULL
                 pointer) means "detect": a leading BOM is consumed and
                 selects the order (-1 little endian, 1 big endian), and
                 the result is stored back through the pointer.  Without
                 a BOM the platform's native order is used.
     consumed  - if NULL, incomplete data at the end of the input is an
                 error; if non-NULL, decoding stops before it and the
                 number of bytes used is stored here.

   Returns a new str object, or NULL after jumping to onError. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* read the first unit as little endian */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    if (q == e) {
        /* nothing left after the optional BOM */
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means "no BOM seen": use the native order */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Run the stringlib decoder specialised for the writer's
               current buffer kind; it advances q and writer.pos itself.
               As handled by the switch below, results 0..3 are status
               codes and any other value is a character that this function
               still has to write. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* step back over the 2-byte unit that was already read, so
               that it is left unconsumed / included in the error span */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* the error span is the 2 bytes just before q */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* the error span is the first of the two units before q */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* the helper receives &q and &writer; a non-zero result aborts
           the decode */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* common failure exit: release the writer and helper objects */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5707
5708PyObject *
5709_PyUnicode_EncodeUTF16(PyObject *str,
5710 const char *errors,
5711 int byteorder)
5712{
5713 enum PyUnicode_Kind kind;
5714 const void *data;
5715 Py_ssize_t len;
5716 PyObject *v;
5717 unsigned short *out;
5718 Py_ssize_t pairs;
5719#if PY_BIG_ENDIAN
5720 int native_ordering = byteorder >= 0;
5721#else
5722 int native_ordering = byteorder <= 0;
5723#endif
5724 const char *encoding;
5725 Py_ssize_t nsize, pos;
5726 PyObject *errorHandler = NULL;
5727 PyObject *exc = NULL;
5728 PyObject *rep = NULL;
5729
5730 if (!PyUnicode_Check(str)) {
5731 PyErr_BadArgument();
5732 return NULL;
5733 }
5734 if (PyUnicode_READY(str) == -1)
5735 return NULL;
5736 kind = PyUnicode_KIND(str);
5737 data = PyUnicode_DATA(str);
5738 len = PyUnicode_GET_LENGTH(str);
5739
5740 pairs = 0;
5741 if (kind == PyUnicode_4BYTE_KIND) {
5742 const Py_UCS4 *in = (const Py_UCS4 *)data;
5743 const Py_UCS4 *end = in + len;
5744 while (in < end) {
5745 if (*in++ >= 0x10000) {
5746 pairs++;
5747 }
5748 }
5749 }
5750 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5751 return PyErr_NoMemory();
5752 }
5753 nsize = len + pairs + (byteorder == 0);
5754 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5755 if (v == NULL) {
5756 return NULL;
5757 }
5758
5759 /* output buffer is 2-bytes aligned */
5760 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5761 out = (unsigned short *)PyBytes_AS_STRING(v);
5762 if (byteorder == 0) {
5763 *out++ = 0xFEFF;
5764 }
5765 if (len == 0) {
5766 goto done;
5767 }
5768
5769 if (kind == PyUnicode_1BYTE_KIND) {
5770 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5771 goto done;
5772 }
5773
5774 if (byteorder < 0) {
5775 encoding = "utf-16-le";
5776 }
5777 else if (byteorder > 0) {
5778 encoding = "utf-16-be";
5779 }
5780 else {
5781 encoding = "utf-16";
5782 }
5783
5784 pos = 0;
5785 while (pos < len) {
5786 Py_ssize_t repsize, moreunits;
5787
5788 if (kind == PyUnicode_2BYTE_KIND) {
5789 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5790 &out, native_ordering);
5791 }
5792 else {
5793 assert(kind == PyUnicode_4BYTE_KIND);
5794 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5795 &out, native_ordering);
5796 }
5797 if (pos == len)
5798 break;
5799
5800 rep = unicode_encode_call_errorhandler(
5801 errors, &errorHandler,
5802 encoding, "surrogates not allowed",
5803 str, &exc, pos, pos + 1, &pos);
5804 if (!rep)
5805 goto error;
5806
5807 if (PyBytes_Check(rep)) {
5808 repsize = PyBytes_GET_SIZE(rep);
5809 if (repsize & 1) {
5810 raise_encode_exception(&exc, encoding,
5811 str, pos - 1, pos,
5812 "surrogates not allowed");
5813 goto error;
5814 }
5815 moreunits = repsize / 2;
5816 }
5817 else {
5818 assert(PyUnicode_Check(rep));
5819 if (PyUnicode_READY(rep) < 0)
5820 goto error;
5821 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5822 if (!PyUnicode_IS_ASCII(rep)) {
5823 raise_encode_exception(&exc, encoding,
5824 str, pos - 1, pos,
5825 "surrogates not allowed");
5826 goto error;
5827 }
5828 }
5829
5830 /* two bytes are reserved for each surrogate */
5831 if (moreunits > 1) {
5832 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5833 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5834 /* integer overflow */
5835 PyErr_NoMemory();
5836 goto error;
5837 }
5838 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
5839 goto error;
5840 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5841 }
5842
5843 if (PyBytes_Check(rep)) {
5844 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5845 out += moreunits;
5846 } else /* rep is unicode */ {
5847 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5848 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5849 &out, native_ordering);
5850 }
5851
5852 Py_CLEAR(rep);
5853 }
5854
5855 /* Cut back to size actually needed. This is necessary for, for example,
5856 encoding of a string containing isolated surrogates and the 'ignore' handler
5857 is used. */
5858 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5859 if (nsize != PyBytes_GET_SIZE(v))
5860 _PyBytes_Resize(&v, nsize);
5861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
5863 done:
5864 return v;
5865 error:
5866 Py_XDECREF(rep);
5867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
5869 Py_XDECREF(v);
5870 return NULL;
5871#undef STORECHAR
5872}
5873
5874PyObject *
5875PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5876 Py_ssize_t size,
5877 const char *errors,
5878 int byteorder)
5879{
5880 PyObject *result;
5881 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5882 if (tmp == NULL)
5883 return NULL;
5884 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5885 Py_DECREF(tmp);
5886 return result;
5887}
5888
5889PyObject *
5890PyUnicode_AsUTF16String(PyObject *unicode)
5891{
5892 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5893}
5894
5895/* --- Unicode Escape Codec ----------------------------------------------- */
5896
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL
   and is filled in by _PyUnicode_DecodeUnicodeEscape() through
   PyCapsule_Import() the first time a \N escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5898
5899PyObject *
5900_PyUnicode_DecodeUnicodeEscape(const char *s,
5901 Py_ssize_t size,
5902 const char *errors,
5903 const char **first_invalid_escape)
5904{
5905 const char *starts = s;
5906 _PyUnicodeWriter writer;
5907 const char *end;
5908 PyObject *errorHandler = NULL;
5909 PyObject *exc = NULL;
5910
5911 // so we can remember if we've seen an invalid escape char or not
5912 *first_invalid_escape = NULL;
5913
5914 if (size == 0) {
5915 _Py_RETURN_UNICODE_EMPTY();
5916 }
5917 /* Escaped strings will always be longer than the resulting
5918 Unicode string, so we start with size here and then reduce the
5919 length after conversion to the true value.
5920 (but if the error callback returns a long replacement string
5921 we'll have to allocate more space) */
5922 _PyUnicodeWriter_Init(&writer);
5923 writer.min_length = size;
5924 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5925 goto onError;
5926 }
5927
5928 end = s + size;
5929 while (s < end) {
5930 unsigned char c = (unsigned char) *s++;
5931 Py_UCS4 ch;
5932 int count;
5933 Py_ssize_t startinpos;
5934 Py_ssize_t endinpos;
5935 const char *message;
5936
5937#define WRITE_ASCII_CHAR(ch) \
5938 do { \
5939 assert(ch <= 127); \
5940 assert(writer.pos < writer.size); \
5941 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5942 } while(0)
5943
5944#define WRITE_CHAR(ch) \
5945 do { \
5946 if (ch <= writer.maxchar) { \
5947 assert(writer.pos < writer.size); \
5948 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5949 } \
5950 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5951 goto onError; \
5952 } \
5953 } while(0)
5954
5955 /* Non-escape characters are interpreted as Unicode ordinals */
5956 if (c != '\\') {
5957 WRITE_CHAR(c);
5958 continue;
5959 }
5960
5961 startinpos = s - starts - 1;
5962 /* \ - Escapes */
5963 if (s >= end) {
5964 message = "\\ at end of string";
5965 goto error;
5966 }
5967 c = (unsigned char) *s++;
5968
5969 assert(writer.pos < writer.size);
5970 switch (c) {
5971
5972 /* \x escapes */
5973 case '\n': continue;
5974 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5975 case '\'': WRITE_ASCII_CHAR('\''); continue;
5976 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5977 case 'b': WRITE_ASCII_CHAR('\b'); continue;
5978 /* FF */
5979 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5980 case 't': WRITE_ASCII_CHAR('\t'); continue;
5981 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5982 case 'r': WRITE_ASCII_CHAR('\r'); continue;
5983 /* VT */
5984 case 'v': WRITE_ASCII_CHAR('\013'); continue;
5985 /* BEL, not classic C */
5986 case 'a': WRITE_ASCII_CHAR('\007'); continue;
5987
5988 /* \OOO (octal) escapes */
5989 case '0': case '1': case '2': case '3':
5990 case '4': case '5': case '6': case '7':
5991 ch = c - '0';
5992 if (s < end && '0' <= *s && *s <= '7') {
5993 ch = (ch<<3) + *s++ - '0';
5994 if (s < end && '0' <= *s && *s <= '7') {
5995 ch = (ch<<3) + *s++ - '0';
5996 }
5997 }
5998 WRITE_CHAR(ch);
5999 continue;
6000
6001 /* hex escapes */
6002 /* \xXX */
6003 case 'x':
6004 count = 2;
6005 message = "truncated \\xXX escape";
6006 goto hexescape;
6007
6008 /* \uXXXX */
6009 case 'u':
6010 count = 4;
6011 message = "truncated \\uXXXX escape";
6012 goto hexescape;
6013
6014 /* \UXXXXXXXX */
6015 case 'U':
6016 count = 8;
6017 message = "truncated \\UXXXXXXXX escape";
6018 hexescape:
6019 for (ch = 0; count && s < end; ++s, --count) {
6020 c = (unsigned char)*s;
6021 ch <<= 4;
6022 if (c >= '0' && c <= '9') {
6023 ch += c - '0';
6024 }
6025 else if (c >= 'a' && c <= 'f') {
6026 ch += c - ('a' - 10);
6027 }
6028 else if (c >= 'A' && c <= 'F') {
6029 ch += c - ('A' - 10);
6030 }
6031 else {
6032 break;
6033 }
6034 }
6035 if (count) {
6036 goto error;
6037 }
6038
6039 /* when we get here, ch is a 32-bit unicode character */
6040 if (ch > MAX_UNICODE) {
6041 message = "illegal Unicode character";
6042 goto error;
6043 }
6044
6045 WRITE_CHAR(ch);
6046 continue;
6047
6048 /* \N{name} */
6049 case 'N':
6050 if (ucnhash_CAPI == NULL) {
6051 /* load the unicode data module */
6052 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6053 PyUnicodeData_CAPSULE_NAME, 1);
6054 if (ucnhash_CAPI == NULL) {
6055 PyErr_SetString(
6056 PyExc_UnicodeError,
6057 "\\N escapes not supported (can't load unicodedata module)"
6058 );
6059 goto onError;
6060 }
6061 }
6062
6063 message = "malformed \\N character escape";
6064 if (*s == '{') {
6065 const char *start = ++s;
6066 size_t namelen;
6067 /* look for the closing brace */
6068 while (s < end && *s != '}')
6069 s++;
6070 namelen = s - start;
6071 if (namelen && s < end) {
6072 /* found a name. look it up in the unicode database */
6073 s++;
6074 ch = 0xffffffff; /* in case 'getcode' messes up */
6075 if (namelen <= INT_MAX &&
6076 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6077 &ch, 0)) {
6078 assert(ch <= MAX_UNICODE);
6079 WRITE_CHAR(ch);
6080 continue;
6081 }
6082 message = "unknown Unicode character name";
6083 }
6084 }
6085 goto error;
6086
6087 default:
6088 if (*first_invalid_escape == NULL) {
6089 *first_invalid_escape = s-1; /* Back up one char, since we've
6090 already incremented s. */
6091 }
6092 WRITE_ASCII_CHAR('\\');
6093 WRITE_CHAR(c);
6094 continue;
6095 }
6096
6097 error:
6098 endinpos = s-starts;
6099 writer.min_length = end - s + writer.pos;
6100 if (unicode_decode_call_errorhandler_writer(
6101 errors, &errorHandler,
6102 "unicodeescape", message,
6103 &starts, &end, &startinpos, &endinpos, &exc, &s,
6104 &writer)) {
6105 goto onError;
6106 }
6107 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6108 goto onError;
6109 }
6110
6111#undef WRITE_ASCII_CHAR
6112#undef WRITE_CHAR
6113 }
6114
6115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
6117 return _PyUnicodeWriter_Finish(&writer);
6118
6119 onError:
6120 _PyUnicodeWriter_Dealloc(&writer);
6121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
6123 return NULL;
6124}
6125
6126PyObject *
6127PyUnicode_DecodeUnicodeEscape(const char *s,
6128 Py_ssize_t size,
6129 const char *errors)
6130{
6131 const char *first_invalid_escape;
6132 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6133 &first_invalid_escape);
6134 if (result == NULL)
6135 return NULL;
6136 if (first_invalid_escape != NULL) {
6137 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6138 "invalid escape sequence '\\%c'",
6139 *first_invalid_escape) < 0) {
6140 Py_DECREF(result);
6141 return NULL;
6142 }
6143 }
6144 return result;
6145}
6146
6147/* Return a Unicode-Escape string version of the Unicode object. */
6148
6149PyObject *
6150PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6151{
6152 Py_ssize_t i, len;
6153 PyObject *repr;
6154 char *p;
6155 enum PyUnicode_Kind kind;
6156 void *data;
6157 Py_ssize_t expandsize;
6158
6159 /* Initial allocation is based on the longest-possible character
6160 escape.
6161
6162 For UCS1 strings it's '\xxx', 4 bytes per source character.
6163 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6164 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6165 */
6166
6167 if (!PyUnicode_Check(unicode)) {
6168 PyErr_BadArgument();
6169 return NULL;
6170 }
6171 if (PyUnicode_READY(unicode) == -1) {
6172 return NULL;
6173 }
6174
6175 len = PyUnicode_GET_LENGTH(unicode);
6176 if (len == 0) {
6177 return PyBytes_FromStringAndSize(NULL, 0);
6178 }
6179
6180 kind = PyUnicode_KIND(unicode);
6181 data = PyUnicode_DATA(unicode);
6182 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6183 bytes, and 1 byte characters 4. */
6184 expandsize = kind * 2 + 2;
6185 if (len > PY_SSIZE_T_MAX / expandsize) {
6186 return PyErr_NoMemory();
6187 }
6188 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6189 if (repr == NULL) {
6190 return NULL;
6191 }
6192
6193 p = PyBytes_AS_STRING(repr);
6194 for (i = 0; i < len; i++) {
6195 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6196
6197 /* U+0000-U+00ff range */
6198 if (ch < 0x100) {
6199 if (ch >= ' ' && ch < 127) {
6200 if (ch != '\\') {
6201 /* Copy printable US ASCII as-is */
6202 *p++ = (char) ch;
6203 }
6204 /* Escape backslashes */
6205 else {
6206 *p++ = '\\';
6207 *p++ = '\\';
6208 }
6209 }
6210
6211 /* Map special whitespace to '\t', \n', '\r' */
6212 else if (ch == '\t') {
6213 *p++ = '\\';
6214 *p++ = 't';
6215 }
6216 else if (ch == '\n') {
6217 *p++ = '\\';
6218 *p++ = 'n';
6219 }
6220 else if (ch == '\r') {
6221 *p++ = '\\';
6222 *p++ = 'r';
6223 }
6224
6225 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6226 else {
6227 *p++ = '\\';
6228 *p++ = 'x';
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
6231 }
6232 }
6233 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6234 else if (ch < 0x10000) {
6235 *p++ = '\\';
6236 *p++ = 'u';
6237 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6238 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6239 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6240 *p++ = Py_hexdigits[ch & 0x000F];
6241 }
6242 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6243 else {
6244
6245 /* Make sure that the first two digits are zero */
6246 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6247 *p++ = '\\';
6248 *p++ = 'U';
6249 *p++ = '0';
6250 *p++ = '0';
6251 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6256 *p++ = Py_hexdigits[ch & 0x0000000F];
6257 }
6258 }
6259
6260 assert(p - PyBytes_AS_STRING(repr) > 0);
6261 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6262 return NULL;
6263 }
6264 return repr;
6265}
6266
6267PyObject *
6268PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6269 Py_ssize_t size)
6270{
6271 PyObject *result;
6272 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6273 if (tmp == NULL) {
6274 return NULL;
6275 }
6276
6277 result = PyUnicode_AsUnicodeEscapeString(tmp);
6278 Py_DECREF(tmp);
6279 return result;
6280}
6281
6282/* --- Raw Unicode Escape Codec ------------------------------------------- */
6283
/* Decode `size` bytes at `s` using the Raw-Unicode-Escape codec and return a
   new str object, or NULL with an exception set.

   Only "\uXXXX" and "\UXXXXXXXX" are treated as escapes.  Every other byte,
   including a backslash followed by any other character, is written to the
   output as the code point with the same ordinal.  Escapes with too few hex
   digits or a value above MAX_UNICODE are handed to the decoding error
   handler named by `errors`. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store a character: directly when it fits the writer's current maxchar,
   otherwise through the writer API (which may fail and jump to onError). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        /* (a backslash that is the very last byte is copied literally too) */
        if (c != '\\' || s >= end) {
            WRITE_CHAR(c);
            continue;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not an escape: emit the backslash and the following character
               unchanged.  Both are consumed here, so the second backslash of
               a "\\" pair cannot start an escape of its own. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }
        /* the escape started two bytes back: at the backslash */
        startinpos = s - starts - 2;

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count && s < end; ++s, --count) {
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                break;
            }
        }
        if (!count) {
            /* all digits were present */
            if (ch <= MAX_UNICODE) {
                WRITE_CHAR(ch);
                continue;
            }
            message = "\\Uxxxxxxxx out of range";
        }

        /* Either the escape was cut short (message still says "truncated")
           or its value was out of range: call the error handler. */
        endinpos = s-starts;
        /* room for what is already written plus the rest of the input */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
            goto onError;
        }

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;

}
6403
6404
6405PyObject *
6406PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6407{
6408 PyObject *repr;
6409 char *p;
6410 Py_ssize_t expandsize, pos;
6411 int kind;
6412 void *data;
6413 Py_ssize_t len;
6414
6415 if (!PyUnicode_Check(unicode)) {
6416 PyErr_BadArgument();
6417 return NULL;
6418 }
6419 if (PyUnicode_READY(unicode) == -1) {
6420 return NULL;
6421 }
6422 kind = PyUnicode_KIND(unicode);
6423 data = PyUnicode_DATA(unicode);
6424 len = PyUnicode_GET_LENGTH(unicode);
6425 if (kind == PyUnicode_1BYTE_KIND) {
6426 return PyBytes_FromStringAndSize(data, len);
6427 }
6428
6429 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6430 bytes, and 1 byte characters 4. */
6431 expandsize = kind * 2 + 2;
6432
6433 if (len > PY_SSIZE_T_MAX / expandsize) {
6434 return PyErr_NoMemory();
6435 }
6436 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6437 if (repr == NULL) {
6438 return NULL;
6439 }
6440 if (len == 0) {
6441 return repr;
6442 }
6443
6444 p = PyBytes_AS_STRING(repr);
6445 for (pos = 0; pos < len; pos++) {
6446 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6447
6448 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6449 if (ch < 0x100) {
6450 *p++ = (char) ch;
6451 }
6452 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6453 else if (ch < 0x10000) {
6454 *p++ = '\\';
6455 *p++ = 'u';
6456 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6459 *p++ = Py_hexdigits[ch & 15];
6460 }
6461 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6462 else {
6463 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6464 *p++ = '\\';
6465 *p++ = 'U';
6466 *p++ = '0';
6467 *p++ = '0';
6468 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6469 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6473 *p++ = Py_hexdigits[ch & 15];
6474 }
6475 }
6476
6477 assert(p > PyBytes_AS_STRING(repr));
6478 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6479 return NULL;
6480 }
6481 return repr;
6482}
6483
6484PyObject *
6485PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6486 Py_ssize_t size)
6487{
6488 PyObject *result;
6489 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6490 if (tmp == NULL)
6491 return NULL;
6492 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6493 Py_DECREF(tmp);
6494 return result;
6495}
6496
6497/* --- Unicode Internal Codec ------------------------------------------- */
6498
/* Decode `size` bytes at `s` as a raw array of Py_UNICODE values (the
   deprecated "unicode_internal" codec) and return a new str object, or NULL
   with an exception set.

   A DeprecationWarning is issued on every call.  A trailing partial
   Py_UNICODE unit and, when Py_UNICODE_WIDE is defined, values above
   0x10FFFF are reported through the decoding error handler named by
   `errors`.  When Py_UNICODE_WIDE is not defined, a high surrogate that is
   directly followed by a low surrogate is joined into one code point. */
PyObject *
_PyUnicode_DecodeUnicodeInternal(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* a non-zero result means the warning was turned into an exception */
    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1))
        return NULL;

    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    _PyUnicodeWriter_Init(&writer);
    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
        PyErr_NoMemory();
        goto onError;
    }
    /* number of Py_UNICODE units in the input, rounded up */
    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

    end = s + size;
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
        if (end - s < Py_UNICODE_SIZE) {
            /* fewer bytes left than one full unit */
            endinpos = end-starts;
            reason = "truncated input";
            goto error;
        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
        ((char *) &uch)[1] = s[1];
#ifdef Py_UNICODE_WIDE
        ((char *) &uch)[2] = s[2];
        ((char *) &uch)[3] = s[3];
#endif
        ch = uch;
#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (ch > 0x10ffff) {
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
            goto error;
        }
#endif
        s += Py_UNICODE_SIZE;
#ifndef Py_UNICODE_WIDE
        /* Narrow units: combine a high surrogate with a low surrogate that
           immediately follows it; otherwise the unit is written on its own. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
            ((char *) &uch2)[1] = s[1];
            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
                s += Py_UNICODE_SIZE;
            }
        }
#endif

        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
            goto onError;
        continue;

      error:
        /* s has not been advanced on either error path, so it still points
           at the start of the offending unit */
        startinpos = s - starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicode_internal", reason,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6598
6599/* --- Latin-1 Codec ------------------------------------------------------ */
6600
6601PyObject *
6602PyUnicode_DecodeLatin1(const char *s,
6603 Py_ssize_t size,
6604 const char *errors)
6605{
6606 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6607 return _PyUnicode_FromUCS1((unsigned char*)s, size);
6608}
6609
6610/* create or adjust a UnicodeEncodeError */
6611static void
6612make_encode_exception(PyObject **exceptionObject,
6613 const char *encoding,
6614 PyObject *unicode,
6615 Py_ssize_t startpos, Py_ssize_t endpos,
6616 const char *reason)
6617{
6618 if (*exceptionObject == NULL) {
6619 *exceptionObject = PyObject_CallFunction(
6620 PyExc_UnicodeEncodeError, "sOnns",
6621 encoding, unicode, startpos, endpos, reason);
6622 }
6623 else {
6624 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6625 goto onError;
6626 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6627 goto onError;
6628 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6629 goto onError;
6630 return;
6631 onError:
6632 Py_CLEAR(*exceptionObject);
6633 }
6634}
6635
6636/* raises a UnicodeEncodeError */
6637static void
6638raise_encode_exception(PyObject **exceptionObject,
6639 const char *encoding,
6640 PyObject *unicode,
6641 Py_ssize_t startpos, Py_ssize_t endpos,
6642 const char *reason)
6643{
6644 make_encode_exception(exceptionObject,
6645 encoding, unicode, startpos, endpos, reason);
6646 if (*exceptionObject != NULL)
6647 PyCodec_StrictErrors(*exceptionObject);
6648}
6649
6650/* error handling callback helper:
6651 build arguments, call the callback and check the arguments,
6652 put the result into newpos and return the replacement string, which
6653 has to be freed by the caller */
6654static PyObject *
6655unicode_encode_call_errorhandler(const char *errors,
6656 PyObject **errorHandler,
6657 const char *encoding, const char *reason,
6658 PyObject *unicode, PyObject **exceptionObject,
6659 Py_ssize_t startpos, Py_ssize_t endpos,
6660 Py_ssize_t *newpos)
6661{
6662 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6663 Py_ssize_t len;
6664 PyObject *restuple;
6665 PyObject *resunicode;
6666
6667 if (*errorHandler == NULL) {
6668 *errorHandler = PyCodec_LookupError(errors);
6669 if (*errorHandler == NULL)
6670 return NULL;
6671 }
6672
6673 if (PyUnicode_READY(unicode) == -1)
6674 return NULL;
6675 len = PyUnicode_GET_LENGTH(unicode);
6676
6677 make_encode_exception(exceptionObject,
6678 encoding, unicode, startpos, endpos, reason);
6679 if (*exceptionObject == NULL)
6680 return NULL;
6681
6682 restuple = PyObject_CallFunctionObjArgs(
6683 *errorHandler, *exceptionObject, NULL);
6684 if (restuple == NULL)
6685 return NULL;
6686 if (!PyTuple_Check(restuple)) {
6687 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6688 Py_DECREF(restuple);
6689 return NULL;
6690 }
6691 if (!PyArg_ParseTuple(restuple, argparse,
6692 &resunicode, newpos)) {
6693 Py_DECREF(restuple);
6694 return NULL;
6695 }
6696 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6697 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6698 Py_DECREF(restuple);
6699 return NULL;
6700 }
6701 if (*newpos<0)
6702 *newpos = len + *newpos;
6703 if (*newpos<0 || *newpos>len) {
6704 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6705 Py_DECREF(restuple);
6706 return NULL;
6707 }
6708 Py_INCREF(resunicode);
6709 Py_DECREF(restuple);
6710 return resunicode;
6711}
6712
6713static PyObject *
6714unicode_encode_ucs1(PyObject *unicode,
6715 const char *errors,
6716 const Py_UCS4 limit)
6717{
6718 /* input state */
6719 Py_ssize_t pos=0, size;
6720 int kind;
6721 void *data;
6722 /* pointer into the output */
6723 char *str;
6724 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6725 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6726 PyObject *error_handler_obj = NULL;
6727 PyObject *exc = NULL;
6728 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6729 PyObject *rep = NULL;
6730 /* output object */
6731 _PyBytesWriter writer;
6732
6733 if (PyUnicode_READY(unicode) == -1)
6734 return NULL;
6735 size = PyUnicode_GET_LENGTH(unicode);
6736 kind = PyUnicode_KIND(unicode);
6737 data = PyUnicode_DATA(unicode);
6738 /* allocate enough for a simple encoding without
6739 replacements, if we need more, we'll resize */
6740 if (size == 0)
6741 return PyBytes_FromStringAndSize(NULL, 0);
6742
6743 _PyBytesWriter_Init(&writer);
6744 str = _PyBytesWriter_Alloc(&writer, size);
6745 if (str == NULL)
6746 return NULL;
6747
6748 while (pos < size) {
6749 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6750
6751 /* can we encode this? */
6752 if (ch < limit) {
6753 /* no overflow check, because we know that the space is enough */
6754 *str++ = (char)ch;
6755 ++pos;
6756 }
6757 else {
6758 Py_ssize_t newpos, i;
6759 /* startpos for collecting unencodable chars */
6760 Py_ssize_t collstart = pos;
6761 Py_ssize_t collend = collstart + 1;
6762 /* find all unecodable characters */
6763
6764 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6765 ++collend;
6766
6767 /* Only overallocate the buffer if it's not the last write */
6768 writer.overallocate = (collend < size);
6769
6770 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6771 if (error_handler == _Py_ERROR_UNKNOWN)
6772 error_handler = get_error_handler(errors);
6773
6774 switch (error_handler) {
6775 case _Py_ERROR_STRICT:
6776 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6777 goto onError;
6778
6779 case _Py_ERROR_REPLACE:
6780 memset(str, '?', collend - collstart);
6781 str += (collend - collstart);
6782 /* fall through */
6783 case _Py_ERROR_IGNORE:
6784 pos = collend;
6785 break;
6786
6787 case _Py_ERROR_BACKSLASHREPLACE:
6788 /* subtract preallocated bytes */
6789 writer.min_size -= (collend - collstart);
6790 str = backslashreplace(&writer, str,
6791 unicode, collstart, collend);
6792 if (str == NULL)
6793 goto onError;
6794 pos = collend;
6795 break;
6796
6797 case _Py_ERROR_XMLCHARREFREPLACE:
6798 /* subtract preallocated bytes */
6799 writer.min_size -= (collend - collstart);
6800 str = xmlcharrefreplace(&writer, str,
6801 unicode, collstart, collend);
6802 if (str == NULL)
6803 goto onError;
6804 pos = collend;
6805 break;
6806
6807 case _Py_ERROR_SURROGATEESCAPE:
6808 for (i = collstart; i < collend; ++i) {
6809 ch = PyUnicode_READ(kind, data, i);
6810 if (ch < 0xdc80 || 0xdcff < ch) {
6811 /* Not a UTF-8b surrogate */
6812 break;
6813 }
6814 *str++ = (char)(ch - 0xdc00);
6815 ++pos;
6816 }
6817 if (i >= collend)
6818 break;
6819 collstart = pos;
6820 assert(collstart != collend);
6821 /* fall through */
6822
6823 default:
6824 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6825 encoding, reason, unicode, &exc,
6826 collstart, collend, &newpos);
6827 if (rep == NULL)
6828 goto onError;
6829
6830 /* subtract preallocated bytes */
6831 writer.min_size -= newpos - collstart;
6832
6833 if (PyBytes_Check(rep)) {
6834 /* Directly copy bytes result to output. */
6835 str = _PyBytesWriter_WriteBytes(&writer, str,
6836 PyBytes_AS_STRING(rep),
6837 PyBytes_GET_SIZE(rep));
6838 if (str == NULL)
6839 goto onError;
6840 }
6841 else {
6842 assert(PyUnicode_Check(rep));
6843
6844 if (PyUnicode_READY(rep) < 0)
6845 goto onError;
6846
6847 if (limit == 256 ?
6848 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6849 !PyUnicode_IS_ASCII(rep))
6850 {
6851 /* Not all characters are smaller than limit */
6852 raise_encode_exception(&exc, encoding, unicode,
6853 collstart, collend, reason);
6854 goto onError;
6855 }
6856 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6857 str = _PyBytesWriter_WriteBytes(&writer, str,
6858 PyUnicode_DATA(rep),
6859 PyUnicode_GET_LENGTH(rep));
6860 }
6861 pos = newpos;
6862 Py_CLEAR(rep);
6863 }
6864
6865 /* If overallocation was disabled, ensure that it was the last
6866 write. Otherwise, we missed an optimization */
6867 assert(writer.overallocate || pos == size);
6868 }
6869 }
6870
6871 Py_XDECREF(error_handler_obj);
6872 Py_XDECREF(exc);
6873 return _PyBytesWriter_Finish(&writer, str);
6874
6875 onError:
6876 Py_XDECREF(rep);
6877 _PyBytesWriter_Dealloc(&writer);
6878 Py_XDECREF(error_handler_obj);
6879 Py_XDECREF(exc);
6880 return NULL;
6881}
6882
6883/* Deprecated */
6884PyObject *
6885PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6886 Py_ssize_t size,
6887 const char *errors)
6888{
6889 PyObject *result;
6890 PyObject *unicode = PyUnicode_FromWideChar(p, size);
6891 if (unicode == NULL)
6892 return NULL;
6893 result = unicode_encode_ucs1(unicode, errors, 256);
6894 Py_DECREF(unicode);
6895 return result;
6896}
6897
6898PyObject *
6899_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6900{
6901 if (!PyUnicode_Check(unicode)) {
6902 PyErr_BadArgument();
6903 return NULL;
6904 }
6905 if (PyUnicode_READY(unicode) == -1)
6906 return NULL;
6907 /* Fast path: if it is a one-byte string, construct
6908 bytes object directly. */
6909 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911 PyUnicode_GET_LENGTH(unicode));
6912 /* Non-Latin-1 characters present. Defer to above function to
6913 raise the exception. */
6914 return unicode_encode_ucs1(unicode, errors, 256);
6915}
6916
6917PyObject*
6918PyUnicode_AsLatin1String(PyObject *unicode)
6919{
6920 return _PyUnicode_AsLatin1String(unicode, NULL);
6921}
6922
6923/* --- 7-bit ASCII Codec -------------------------------------------------- */
6924
6925PyObject *
6926PyUnicode_DecodeASCII(const char *s,
6927 Py_ssize_t size,
6928 const char *errors)
6929{
6930 const char *starts = s;
6931 _PyUnicodeWriter writer;
6932 int kind;
6933 void *data;
6934 Py_ssize_t startinpos;
6935 Py_ssize_t endinpos;
6936 Py_ssize_t outpos;
6937 const char *e;
6938 PyObject *error_handler_obj = NULL;
6939 PyObject *exc = NULL;
6940 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6941
6942 if (size == 0)
6943 _Py_RETURN_UNICODE_EMPTY();
6944
6945 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6946 if (size == 1 && (unsigned char)s[0] < 128)
6947 return get_latin1_char((unsigned char)s[0]);
6948
6949 _PyUnicodeWriter_Init(&writer);
6950 writer.min_length = size;
6951 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6952 return NULL;
6953
6954 e = s + size;
6955 data = writer.data;
6956 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6957 writer.pos = outpos;
6958 if (writer.pos == size)
6959 return _PyUnicodeWriter_Finish(&writer);
6960
6961 s += writer.pos;
6962 kind = writer.kind;
6963 while (s < e) {
6964 unsigned char c = (unsigned char)*s;
6965 if (c < 128) {
6966 PyUnicode_WRITE(kind, data, writer.pos, c);
6967 writer.pos++;
6968 ++s;
6969 continue;
6970 }
6971
6972 /* byte outsize range 0x00..0x7f: call the error handler */
6973
6974 if (error_handler == _Py_ERROR_UNKNOWN)
6975 error_handler = get_error_handler(errors);
6976
6977 switch (error_handler)
6978 {
6979 case _Py_ERROR_REPLACE:
6980 case _Py_ERROR_SURROGATEESCAPE:
6981 /* Fast-path: the error handler only writes one character,
6982 but we may switch to UCS2 at the first write */
6983 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6984 goto onError;
6985 kind = writer.kind;
6986 data = writer.data;
6987
6988 if (error_handler == _Py_ERROR_REPLACE)
6989 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6990 else
6991 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6992 writer.pos++;
6993 ++s;
6994 break;
6995
6996 case _Py_ERROR_IGNORE:
6997 ++s;
6998 break;
6999
7000 default:
7001 startinpos = s-starts;
7002 endinpos = startinpos + 1;
7003 if (unicode_decode_call_errorhandler_writer(
7004 errors, &error_handler_obj,
7005 "ascii", "ordinal not in range(128)",
7006 &starts, &e, &startinpos, &endinpos, &exc, &s,
7007 &writer))
7008 goto onError;
7009 kind = writer.kind;
7010 data = writer.data;
7011 }
7012 }
7013 Py_XDECREF(error_handler_obj);
7014 Py_XDECREF(exc);
7015 return _PyUnicodeWriter_Finish(&writer);
7016
7017 onError:
7018 _PyUnicodeWriter_Dealloc(&writer);
7019 Py_XDECREF(error_handler_obj);
7020 Py_XDECREF(exc);
7021 return NULL;
7022}
7023
7024/* Deprecated */
7025PyObject *
7026PyUnicode_EncodeASCII(const Py_UNICODE *p,
7027 Py_ssize_t size,
7028 const char *errors)
7029{
7030 PyObject *result;
7031 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7032 if (unicode == NULL)
7033 return NULL;
7034 result = unicode_encode_ucs1(unicode, errors, 128);
7035 Py_DECREF(unicode);
7036 return result;
7037}
7038
7039PyObject *
7040_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7041{
7042 if (!PyUnicode_Check(unicode)) {
7043 PyErr_BadArgument();
7044 return NULL;
7045 }
7046 if (PyUnicode_READY(unicode) == -1)
7047 return NULL;
7048 /* Fast path: if it is an ASCII-only string, construct bytes object
7049 directly. Else defer to above function to raise the exception. */
7050 if (PyUnicode_IS_ASCII(unicode))
7051 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7052 PyUnicode_GET_LENGTH(unicode));
7053 return unicode_encode_ucs1(unicode, errors, 128);
7054}
7055
7056PyObject *
7057PyUnicode_AsASCIIString(PyObject *unicode)
7058{
7059 return _PyUnicode_AsASCIIString(unicode, NULL);
7060}
7061
7062#ifdef MS_WINDOWS
7063
7064/* --- MBCS codecs for Windows -------------------------------------------- */
7065
/* The Windows conversion functions take lengths as int.  When int is
   narrower than size_t the code-page codecs below process their input in
   several chunks (see the NEED_RETRY sections of
   decode_code_page_stateful() and encode_code_page()). */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide this flag; it is
   passed to WideCharToMultiByte() for CP_UTF8 by encode_code_page_flags(). */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
7073
7074static const char*
7075code_page_name(UINT code_page, PyObject **obj)
7076{
7077 *obj = NULL;
7078 if (code_page == CP_ACP)
7079 return "mbcs";
7080 if (code_page == CP_UTF7)
7081 return "CP_UTF7";
7082 if (code_page == CP_UTF8)
7083 return "CP_UTF8";
7084
7085 *obj = PyBytes_FromFormat("cp%u", code_page);
7086 if (*obj == NULL)
7087 return NULL;
7088 return PyBytes_AS_STRING(*obj);
7089}
7090
7091static DWORD
7092decode_code_page_flags(UINT code_page)
7093{
7094 if (code_page == CP_UTF7) {
7095 /* The CP_UTF7 decoder only supports flags=0 */
7096 return 0;
7097 }
7098 else
7099 return MB_ERR_INVALID_CHARS;
7100}
7101
7102/*
7103 * Decode a byte string from a Windows code page into unicode object in strict
7104 * mode.
7105 *
7106 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7107 * OSError and returns -1 on other error.
7108 */
7109static int
7110decode_code_page_strict(UINT code_page,
7111 PyObject **v,
7112 const char *in,
7113 int insize)
7114{
7115 const DWORD flags = decode_code_page_flags(code_page);
7116 wchar_t *out;
7117 DWORD outsize;
7118
7119 /* First get the size of the result */
7120 assert(insize > 0);
7121 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7122 if (outsize <= 0)
7123 goto error;
7124
7125 if (*v == NULL) {
7126 /* Create unicode object */
7127 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7128 *v = (PyObject*)_PyUnicode_New(outsize);
7129 if (*v == NULL)
7130 return -1;
7131 out = PyUnicode_AS_UNICODE(*v);
7132 }
7133 else {
7134 /* Extend unicode object */
7135 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7136 if (unicode_resize(v, n + outsize) < 0)
7137 return -1;
7138 out = PyUnicode_AS_UNICODE(*v) + n;
7139 }
7140
7141 /* Do the conversion */
7142 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7143 if (outsize <= 0)
7144 goto error;
7145 return insize;
7146
7147error:
7148 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7149 return -2;
7150 PyErr_SetFromWindowsErr(0);
7151 return -1;
7152}
7153
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * code_page: Windows code page identifier.
 * v:         in/out result object.  If *v is NULL a new unicode object is
 *            created, otherwise the decoded characters are appended to *v.
 * in/size:   input bytes; size must be > 0.
 * errors:    error handler name (NULL is treated like "strict").
 * final:     if zero, an undecodable sequence reaching the end of the
 *            input stops the decoding instead of being reported.
 *
 * Returns the number of consumed bytes on success, or raises an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        PyObject **v,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    const DWORD flags = decode_code_page_flags(code_page);
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *startout, *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* Codec name used in exceptions; encoding_obj (if any) keeps the name
       buffer alive until the common exit path below. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Reserve room for the worst case: Py_ARRAY_LENGTH(buffer) wchar_t per
       input byte.  The object is shrunk to the real size at the end. */
    if (*v == NULL) {
        /* Create unicode object */
        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
            PyErr_NoMemory();
            goto error;
        }
        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
        if (*v == NULL)
            goto error;
        startout = PyUnicode_AS_UNICODE(*v);
    }
    else {
        /* Extend unicode object */
        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
            PyErr_NoMemory();
            goto error;
        }
        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
            goto error;
        startout = PyUnicode_AS_UNICODE(*v) + n;
    }

    /* Decode the byte string character per character */
    out = startout;
    while (in < endin)
    {
        /* Decode a character: try 1 byte first and grow the candidate
           sequence one byte at a time until the conversion succeeds. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report a single undecodable byte to the error handler. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - PyUnicode_AS_UNICODE(*v);
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    v, &outpos))
            {
                goto error;
            }
            /* Recompute the output pointer from *v and the updated outpos. */
            out = PyUnicode_AS_UNICODE(*v) + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* write a NUL character at the end */
    *out = 0;

    /* Shrink the unicode object to the number of characters written */
    outsize = out - startout;
    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
    if (unicode_resize(v, outsize) < 0)
        goto error;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* Common exit: the success path falls through with ret >= 0. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7296
7297static PyObject *
7298decode_code_page_stateful(int code_page,
7299 const char *s, Py_ssize_t size,
7300 const char *errors, Py_ssize_t *consumed)
7301{
7302 PyObject *v = NULL;
7303 int chunk_size, final, converted, done;
7304
7305 if (code_page < 0) {
7306 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7307 return NULL;
7308 }
7309 if (size < 0) {
7310 PyErr_BadInternalCall();
7311 return NULL;
7312 }
7313
7314 if (consumed)
7315 *consumed = 0;
7316
7317 do
7318 {
7319#ifdef NEED_RETRY
7320 if (size > INT_MAX) {
7321 chunk_size = INT_MAX;
7322 final = 0;
7323 done = 0;
7324 }
7325 else
7326#endif
7327 {
7328 chunk_size = (int)size;
7329 final = (consumed == NULL);
7330 done = 1;
7331 }
7332
7333 if (chunk_size == 0 && done) {
7334 if (v != NULL)
7335 break;
7336 _Py_RETURN_UNICODE_EMPTY();
7337 }
7338
7339 converted = decode_code_page_strict(code_page, &v,
7340 s, chunk_size);
7341 if (converted == -2)
7342 converted = decode_code_page_errors(code_page, &v,
7343 s, chunk_size,
7344 errors, final);
7345 assert(converted != 0 || done);
7346
7347 if (converted < 0) {
7348 Py_XDECREF(v);
7349 return NULL;
7350 }
7351
7352 if (consumed)
7353 *consumed += converted;
7354
7355 s += converted;
7356 size -= converted;
7357 } while (!done);
7358
7359 return unicode_result(v);
7360}
7361
7362PyObject *
7363PyUnicode_DecodeCodePageStateful(int code_page,
7364 const char *s,
7365 Py_ssize_t size,
7366 const char *errors,
7367 Py_ssize_t *consumed)
7368{
7369 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7370}
7371
7372PyObject *
7373PyUnicode_DecodeMBCSStateful(const char *s,
7374 Py_ssize_t size,
7375 const char *errors,
7376 Py_ssize_t *consumed)
7377{
7378 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7379}
7380
7381PyObject *
7382PyUnicode_DecodeMBCS(const char *s,
7383 Py_ssize_t size,
7384 const char *errors)
7385{
7386 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7387}
7388
7389static DWORD
7390encode_code_page_flags(UINT code_page, const char *errors)
7391{
7392 if (code_page == CP_UTF8) {
7393 return WC_ERR_INVALID_CHARS;
7394 }
7395 else if (code_page == CP_UTF7) {
7396 /* CP_UTF7 only supports flags=0 */
7397 return 0;
7398 }
7399 else {
7400 if (errors != NULL && strcmp(errors, "replace") == 0)
7401 return 0;
7402 else
7403 return WC_NO_BEST_FIT_CHARS;
7404 }
7405}
7406
7407/*
7408 * Encode a Unicode string to a Windows code page into a byte string in strict
7409 * mode.
7410 *
7411 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7412 * an OSError and returns -1 on other error.
7413 */
7414static int
7415encode_code_page_strict(UINT code_page, PyObject **outbytes,
7416 PyObject *unicode, Py_ssize_t offset, int len,
7417 const char* errors)
7418{
7419 BOOL usedDefaultChar = FALSE;
7420 BOOL *pusedDefaultChar = &usedDefaultChar;
7421 int outsize;
7422 wchar_t *p;
7423 Py_ssize_t size;
7424 const DWORD flags = encode_code_page_flags(code_page, NULL);
7425 char *out;
7426 /* Create a substring so that we can get the UTF-16 representation
7427 of just the slice under consideration. */
7428 PyObject *substring;
7429
7430 assert(len > 0);
7431
7432 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7433 pusedDefaultChar = &usedDefaultChar;
7434 else
7435 pusedDefaultChar = NULL;
7436
7437 substring = PyUnicode_Substring(unicode, offset, offset+len);
7438 if (substring == NULL)
7439 return -1;
7440 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7441 if (p == NULL) {
7442 Py_DECREF(substring);
7443 return -1;
7444 }
7445 assert(size <= INT_MAX);
7446
7447 /* First get the size of the result */
7448 outsize = WideCharToMultiByte(code_page, flags,
7449 p, (int)size,
7450 NULL, 0,
7451 NULL, pusedDefaultChar);
7452 if (outsize <= 0)
7453 goto error;
7454 /* If we used a default char, then we failed! */
7455 if (pusedDefaultChar && *pusedDefaultChar) {
7456 Py_DECREF(substring);
7457 return -2;
7458 }
7459
7460 if (*outbytes == NULL) {
7461 /* Create string object */
7462 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7463 if (*outbytes == NULL) {
7464 Py_DECREF(substring);
7465 return -1;
7466 }
7467 out = PyBytes_AS_STRING(*outbytes);
7468 }
7469 else {
7470 /* Extend string object */
7471 const Py_ssize_t n = PyBytes_Size(*outbytes);
7472 if (outsize > PY_SSIZE_T_MAX - n) {
7473 PyErr_NoMemory();
7474 Py_DECREF(substring);
7475 return -1;
7476 }
7477 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7478 Py_DECREF(substring);
7479 return -1;
7480 }
7481 out = PyBytes_AS_STRING(*outbytes) + n;
7482 }
7483
7484 /* Do the conversion */
7485 outsize = WideCharToMultiByte(code_page, flags,
7486 p, (int)size,
7487 out, outsize,
7488 NULL, pusedDefaultChar);
7489 Py_CLEAR(substring);
7490 if (outsize <= 0)
7491 goto error;
7492 if (pusedDefaultChar && *pusedDefaultChar)
7493 return -2;
7494 return 0;
7495
7496error:
7497 Py_XDECREF(substring);
7498 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7499 return -2;
7500 PyErr_SetFromWindowsErr(0);
7501 return -1;
7502}
7503
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * code_page:      Windows code page identifier.
 * outbytes:       in/out result object.  If *outbytes is NULL a new bytes
 *                 object is created, otherwise the output is appended to it.
 * unicode:        source string.
 * unicode_offset: index of the first character to encode.
 * insize:         number of characters to encode; must be > 0.
 * errors:         error handler name (NULL is treated like "strict").
 *
 * Returns 0 on success, or raises an exception (OSError, UnicodeEncodeError
 * or whatever the error handler raises) and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    /* Codec name used in exceptions; encoding_obj (if any) keeps the name
       buffer alive until the function returns. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" output is not requested for CP_UTF8 and
       CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve room for the worst case: Py_ARRAY_LENGTH(buffer) bytes per
       input character.  The object is shrunk to the real size at the end. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Build the UTF-16 form of the character: one unit below 0x10000,
           a surrogate pair otherwise. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Success only counts if no default character was used. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character cannot be encoded: ask the error handler for a
           replacement (bytes or str) and the position to resume at. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* Bytes replacement: copied into the output as is. */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Resize by (replacement length - 1); `out` is recomputed
                   from the saved offset after the resize. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: every character must be ASCII (<= 127) and
               is written as a single byte. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Shrink the bytes object to the number of bytes written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Common exit: the success path falls through with ret == 0. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7691
7692static PyObject *
7693encode_code_page(int code_page,
7694 PyObject *unicode,
7695 const char *errors)
7696{
7697 Py_ssize_t len;
7698 PyObject *outbytes = NULL;
7699 Py_ssize_t offset;
7700 int chunk_len, ret, done;
7701
7702 if (!PyUnicode_Check(unicode)) {
7703 PyErr_BadArgument();
7704 return NULL;
7705 }
7706
7707 if (PyUnicode_READY(unicode) == -1)
7708 return NULL;
7709 len = PyUnicode_GET_LENGTH(unicode);
7710
7711 if (code_page < 0) {
7712 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7713 return NULL;
7714 }
7715
7716 if (len == 0)
7717 return PyBytes_FromStringAndSize(NULL, 0);
7718
7719 offset = 0;
7720 do
7721 {
7722#ifdef NEED_RETRY
7723 /* UTF-16 encoding may double the size, so use only INT_MAX/2
7724 chunks. */
7725 if (len > INT_MAX/2) {
7726 chunk_len = INT_MAX/2;
7727 done = 0;
7728 }
7729 else
7730#endif
7731 {
7732 chunk_len = (int)len;
7733 done = 1;
7734 }
7735
7736 ret = encode_code_page_strict(code_page, &outbytes,
7737 unicode, offset, chunk_len,
7738 errors);
7739 if (ret == -2)
7740 ret = encode_code_page_errors(code_page, &outbytes,
7741 unicode, offset,
7742 chunk_len, errors);
7743 if (ret < 0) {
7744 Py_XDECREF(outbytes);
7745 return NULL;
7746 }
7747
7748 offset += chunk_len;
7749 len -= chunk_len;
7750 } while (!done);
7751
7752 return outbytes;
7753}
7754
7755PyObject *
7756PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7757 Py_ssize_t size,
7758 const char *errors)
7759{
7760 PyObject *unicode, *res;
7761 unicode = PyUnicode_FromWideChar(p, size);
7762 if (unicode == NULL)
7763 return NULL;
7764 res = encode_code_page(CP_ACP, unicode, errors);
7765 Py_DECREF(unicode);
7766 return res;
7767}
7768
7769PyObject *
7770PyUnicode_EncodeCodePage(int code_page,
7771 PyObject *unicode,
7772 const char *errors)
7773{
7774 return encode_code_page(code_page, unicode, errors);
7775}
7776
7777PyObject *
7778PyUnicode_AsMBCSString(PyObject *unicode)
7779{
7780 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7781}
7782
7783#undef NEED_RETRY
7784
7785#endif /* MS_WINDOWS */
7786
7787/* --- Character Mapping Codec -------------------------------------------- */
7788
/* Decode `size` bytes at `s` through `mapping`, a str object used as a
   lookup table: the character at index b is the result for byte b.  A byte
   whose index is beyond the end of the table, or whose entry is 0xfffe, is
   undefined and is passed to the error handler named by `errors`.

   Output is appended to `writer`.  Returns 0 on success, -1 with an
   exception set on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maximum character to 0xff, then
                   reload the cached maxchar and data pointer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL here: nothing to release. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a complete UCS2 table.  They leave through
               "goto Error" as soon as a value needs the general code below
               (undefined mapping or a character too wide for the current
               output buffer). */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Entered from the fast loops above with `x` already looked up. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7907
/* Decode `size` bytes at `s` through `mapping`, an arbitrary object that is
   indexed with each byte value (as an int) via PyObject_GetItem().

   For every byte the looked-up item selects the output:
     - LookupError, None, or the value 0xFFFE (as int or 1-char str):
       undefined, the error handler named by `errors` is called;
     - int in 0..MAX_UNICODE: that code point is written;
     - str: its contents are written;
     - anything else: TypeError.

   Output is appended to `writer`.  Returns 0 on success, -1 with an
   exception set on error. */
static int
charmap_decode_mapping(const char *s,
                       Py_ssize_t size,
                       PyObject *mapping,
                       const char *errors,
                       _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    unsigned char ch;
    PyObject *key, *item = NULL;

    e = s + size;

    while (s < e) {
        ch = *s;

        /* Get mapping (char ordinal -> integer, Unicode char or None) */
        key = PyLong_FromLong((long)ch);
        if (key == NULL)
            goto onError;

        item = PyObject_GetItem(mapping, key);
        Py_DECREF(key);
        if (item == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found means: mapping is undefined. */
                PyErr_Clear();
                goto Undefined;
            } else
                goto onError;
        }

        /* Apply mapping */
        if (item == Py_None)
            goto Undefined;
        if (PyLong_Check(item)) {
            long value = PyLong_AS_LONG(item);
            if (value == 0xFFFE)
                goto Undefined;
            if (value < 0 || value > MAX_UNICODE) {
                PyErr_Format(PyExc_TypeError,
                             "character mapping must be in range(0x%lx)",
                             (unsigned long)MAX_UNICODE + 1);
                goto onError;
            }

            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                goto onError;
        }
        else if (PyUnicode_Check(item)) {
            if (PyUnicode_READY(item) == -1)
                goto onError;
            if (PyUnicode_GET_LENGTH(item) == 1) {
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
                if (value == 0xFFFE)
                    goto Undefined;
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                    goto onError;
            }
            else {
                /* Length other than 1: turn on overallocation in the
                   writer before appending the whole string. */
                writer->overallocate = 1;
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
                    goto onError;
            }
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must return integer, None or str");
            goto onError;
        }
        Py_CLEAR(item);
        ++s;
        continue;

        /* Only reached through "goto Undefined"; the normal path ends with
           the "continue" above. */
Undefined:
        /* undefined mapping */
        Py_CLEAR(item);
        startinpos = s-starts;
        endinpos = startinpos+1;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "charmap", "character maps to <undefined>",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                writer)) {
            goto onError;
        }
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(item);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8009
8010PyObject *
8011PyUnicode_DecodeCharmap(const char *s,
8012 Py_ssize_t size,
8013 PyObject *mapping,
8014 const char *errors)
8015{
8016 _PyUnicodeWriter writer;
8017
8018 /* Default to Latin-1 */
8019 if (mapping == NULL)
8020 return PyUnicode_DecodeLatin1(s, size, errors);
8021
8022 if (size == 0)
8023 _Py_RETURN_UNICODE_EMPTY();
8024 _PyUnicodeWriter_Init(&writer);
8025 writer.min_length = size;
8026 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8027 goto onError;
8028
8029 if (PyUnicode_CheckExact(mapping)) {
8030 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8031 goto onError;
8032 }
8033 else {
8034 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8035 goto onError;
8036 }
8037 return _PyUnicodeWriter_Finish(&writer);
8038
8039 onError:
8040 _PyUnicodeWriter_Dealloc(&writer);
8041 return NULL;
8042}
8043
/* Charmap encoding: the lookup table */

/* Three-level trie mapping a BMP code point to a byte value.  It is built
   by PyUnicode_BuildEncodingMap() and queried by encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by ch >> 11; 0xFF marks an unused entry, any other value
       selects a 16-entry level-2 block. */
    unsigned char level1[32];
    /* Number of level-2 blocks and of level-3 blocks stored in level23. */
    int count2, count3;
    /* Variable-length tail (the object is over-allocated): 16*count2
       level-2 entries followed by 128*count3 level-3 entries. */
    unsigned char level23[1];
};
8052
8053static PyObject*
8054encoding_map_size(PyObject *obj, PyObject* args)
8055{
8056 struct encoding_map *map = (struct encoding_map*)obj;
8057 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8058 128*map->count3);
8059}
8060
8061static PyMethodDef encoding_map_methods[] = {
8062 {"size", encoding_map_size, METH_NOARGS,
8063 PyDoc_STR("Return the size (in bytes) of this object") },
8064 { 0 }
8065};
8066
/* tp_dealloc of the EncodingMap type: the object holds no references to
   other objects, so releasing its memory block is all that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8072
/* Type object of the trie built by PyUnicode_BuildEncodingMap().  Apart
   from the name, the basic size and the default flags, only tp_dealloc and
   tp_methods are filled in; every other slot is left at 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8116
8117PyObject*
8118PyUnicode_BuildEncodingMap(PyObject* string)
8119{
8120 PyObject *result;
8121 struct encoding_map *mresult;
8122 int i;
8123 int need_dict = 0;
8124 unsigned char level1[32];
8125 unsigned char level2[512];
8126 unsigned char *mlevel1, *mlevel2, *mlevel3;
8127 int count2 = 0, count3 = 0;
8128 int kind;
8129 void *data;
8130 Py_ssize_t length;
8131 Py_UCS4 ch;
8132
8133 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8134 PyErr_BadArgument();
8135 return NULL;
8136 }
8137 kind = PyUnicode_KIND(string);
8138 data = PyUnicode_DATA(string);
8139 length = PyUnicode_GET_LENGTH(string);
8140 length = Py_MIN(length, 256);
8141 memset(level1, 0xFF, sizeof level1);
8142 memset(level2, 0xFF, sizeof level2);
8143
8144 /* If there isn't a one-to-one mapping of NULL to \0,
8145 or if there are non-BMP characters, we need to use
8146 a mapping dictionary. */
8147 if (PyUnicode_READ(kind, data, 0) != 0)
8148 need_dict = 1;
8149 for (i = 1; i < length; i++) {
8150 int l1, l2;
8151 ch = PyUnicode_READ(kind, data, i);
8152 if (ch == 0 || ch > 0xFFFF) {
8153 need_dict = 1;
8154 break;
8155 }
8156 if (ch == 0xFFFE)
8157 /* unmapped character */
8158 continue;
8159 l1 = ch >> 11;
8160 l2 = ch >> 7;
8161 if (level1[l1] == 0xFF)
8162 level1[l1] = count2++;
8163 if (level2[l2] == 0xFF)
8164 level2[l2] = count3++;
8165 }
8166
8167 if (count2 >= 0xFF || count3 >= 0xFF)
8168 need_dict = 1;
8169
8170 if (need_dict) {
8171 PyObject *result = PyDict_New();
8172 PyObject *key, *value;
8173 if (!result)
8174 return NULL;
8175 for (i = 0; i < length; i++) {
8176 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8177 value = PyLong_FromLong(i);
8178 if (!key || !value)
8179 goto failed1;
8180 if (PyDict_SetItem(result, key, value) == -1)
8181 goto failed1;
8182 Py_DECREF(key);
8183 Py_DECREF(value);
8184 }
8185 return result;
8186 failed1:
8187 Py_XDECREF(key);
8188 Py_XDECREF(value);
8189 Py_DECREF(result);
8190 return NULL;
8191 }
8192
8193 /* Create a three-level trie */
8194 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8195 16*count2 + 128*count3 - 1);
8196 if (!result)
8197 return PyErr_NoMemory();
8198 PyObject_Init(result, &EncodingMapType);
8199 mresult = (struct encoding_map*)result;
8200 mresult->count2 = count2;
8201 mresult->count3 = count3;
8202 mlevel1 = mresult->level1;
8203 mlevel2 = mresult->level23;
8204 mlevel3 = mresult->level23 + 16*count2;
8205 memcpy(mlevel1, level1, 32);
8206 memset(mlevel2, 0xFF, 16*count2);
8207 memset(mlevel3, 0, 128*count3);
8208 count3 = 0;
8209 for (i = 1; i < length; i++) {
8210 int o1, o2, o3, i2, i3;
8211 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8212 if (ch == 0xFFFE)
8213 /* unmapped character */
8214 continue;
8215 o1 = ch>>11;
8216 o2 = (ch>>7) & 0xF;
8217 i2 = 16*mlevel1[o1] + o2;
8218 if (mlevel2[i2] == 0xFF)
8219 mlevel2[i2] = count3++;
8220 o3 = ch & 0x7F;
8221 i3 = 128*mlevel2[i2] + o3;
8222 mlevel3[i3] = i;
8223 }
8224 return result;
8225}
8226
8227static int
8228encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8229{
8230 struct encoding_map *map = (struct encoding_map*)mapping;
8231 int l1 = c>>11;
8232 int l2 = (c>>7) & 0xF;
8233 int l3 = c & 0x7F;
8234 int i;
8235
8236 if (c > 0xFFFF)
8237 return -1;
8238 if (c == 0)
8239 return 0;
8240 /* level 1*/
8241 i = map->level1[l1];
8242 if (i == 0xFF) {
8243 return -1;
8244 }
8245 /* level 2*/
8246 i = map->level23[16*i+l2];
8247 if (i == 0xFF) {
8248 return -1;
8249 }
8250 /* level 3 */
8251 i = map->level23[16*map->count2 + 128*i + l3];
8252 if (i == 0) {
8253 return -1;
8254 }
8255 return i;
8256}
8257
8258/* Lookup the character ch in the mapping. If the character
8259 can't be found, Py_None is returned (or NULL, if another
8260 error occurred). */
8261static PyObject *
8262charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8263{
8264 PyObject *w = PyLong_FromLong((long)c);
8265 PyObject *x;
8266
8267 if (w == NULL)
8268 return NULL;
8269 x = PyObject_GetItem(mapping, w);
8270 Py_DECREF(w);
8271 if (x == NULL) {
8272 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8273 /* No mapping found means: mapping is undefined. */
8274 PyErr_Clear();
8275 Py_RETURN_NONE;
8276 } else
8277 return NULL;
8278 }
8279 else if (x == Py_None)
8280 return x;
8281 else if (PyLong_Check(x)) {
8282 long value = PyLong_AS_LONG(x);
8283 if (value < 0 || value > 255) {
8284 PyErr_SetString(PyExc_TypeError,
8285 "character mapping must be in range(256)");
8286 Py_DECREF(x);
8287 return NULL;
8288 }
8289 return x;
8290 }
8291 else if (PyBytes_Check(x))
8292 return x;
8293 else {
8294 /* wrong return value */
8295 PyErr_Format(PyExc_TypeError,
8296 "character mapping must return integer, bytes or None, not %.400s",
8297 x->ob_type->tp_name);
8298 Py_DECREF(x);
8299 return NULL;
8300 }
8301}
8302
8303static int
8304charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8305{
8306 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8307 /* exponentially overallocate to minimize reallocations */
8308 if (requiredsize < 2*outsize)
8309 requiredsize = 2*outsize;
8310 if (_PyBytes_Resize(outobj, requiredsize))
8311 return -1;
8312 return 0;
8313}
8314
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8318/* lookup the character, put the result in the output string and adjust
8319 various state variables. Resize the output bytes object if not enough
8320 space is available. Return a new reference to the object that
8321 was put in the output buffer, or Py_None, if the mapping was undefined
8322 (in which case no character was written) or NULL, if a
8323 reallocation error occurred. The caller must decref the result */
8324static charmapencode_result
8325charmapencode_output(Py_UCS4 c, PyObject *mapping,
8326 PyObject **outobj, Py_ssize_t *outpos)
8327{
8328 PyObject *rep;
8329 char *outstart;
8330 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8331
8332 if (Py_TYPE(mapping) == &EncodingMapType) {
8333 int res = encoding_map_lookup(c, mapping);
8334 Py_ssize_t requiredsize = *outpos+1;
8335 if (res == -1)
8336 return enc_FAILED;
8337 if (outsize<requiredsize)
8338 if (charmapencode_resize(outobj, outpos, requiredsize))
8339 return enc_EXCEPTION;
8340 outstart = PyBytes_AS_STRING(*outobj);
8341 outstart[(*outpos)++] = (char)res;
8342 return enc_SUCCESS;
8343 }
8344
8345 rep = charmapencode_lookup(c, mapping);
8346 if (rep==NULL)
8347 return enc_EXCEPTION;
8348 else if (rep==Py_None) {
8349 Py_DECREF(rep);
8350 return enc_FAILED;
8351 } else {
8352 if (PyLong_Check(rep)) {
8353 Py_ssize_t requiredsize = *outpos+1;
8354 if (outsize<requiredsize)
8355 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8356 Py_DECREF(rep);
8357 return enc_EXCEPTION;
8358 }
8359 outstart = PyBytes_AS_STRING(*outobj);
8360 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8361 }
8362 else {
8363 const char *repchars = PyBytes_AS_STRING(rep);
8364 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8365 Py_ssize_t requiredsize = *outpos+repsize;
8366 if (outsize<requiredsize)
8367 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8368 Py_DECREF(rep);
8369 return enc_EXCEPTION;
8370 }
8371 outstart = PyBytes_AS_STRING(*outobj);
8372 memcpy(outstart + *outpos, repchars, repsize);
8373 *outpos += repsize;
8374 }
8375 }
8376 Py_DECREF(rep);
8377 return enc_SUCCESS;
8378}
8379
8380/* handle an error in PyUnicode_EncodeCharmap
8381 Return 0 on success, -1 on error */
8382static int
8383charmap_encoding_error(
8384 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8385 PyObject **exceptionObject,
8386 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8387 PyObject **res, Py_ssize_t *respos)
8388{
8389 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8390 Py_ssize_t size, repsize;
8391 Py_ssize_t newpos;
8392 enum PyUnicode_Kind kind;
8393 void *data;
8394 Py_ssize_t index;
8395 /* startpos for collecting unencodable chars */
8396 Py_ssize_t collstartpos = *inpos;
8397 Py_ssize_t collendpos = *inpos+1;
8398 Py_ssize_t collpos;
8399 char *encoding = "charmap";
8400 char *reason = "character maps to <undefined>";
8401 charmapencode_result x;
8402 Py_UCS4 ch;
8403 int val;
8404
8405 if (PyUnicode_READY(unicode) == -1)
8406 return -1;
8407 size = PyUnicode_GET_LENGTH(unicode);
8408 /* find all unencodable characters */
8409 while (collendpos < size) {
8410 PyObject *rep;
8411 if (Py_TYPE(mapping) == &EncodingMapType) {
8412 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8413 val = encoding_map_lookup(ch, mapping);
8414 if (val != -1)
8415 break;
8416 ++collendpos;
8417 continue;
8418 }
8419
8420 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8421 rep = charmapencode_lookup(ch, mapping);
8422 if (rep==NULL)
8423 return -1;
8424 else if (rep!=Py_None) {
8425 Py_DECREF(rep);
8426 break;
8427 }
8428 Py_DECREF(rep);
8429 ++collendpos;
8430 }
8431 /* cache callback name lookup
8432 * (if not done yet, i.e. it's the first error) */
8433 if (*error_handler == _Py_ERROR_UNKNOWN)
8434 *error_handler = get_error_handler(errors);
8435
8436 switch (*error_handler) {
8437 case _Py_ERROR_STRICT:
8438 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8439 return -1;
8440
8441 case _Py_ERROR_REPLACE:
8442 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8443 x = charmapencode_output('?', mapping, res, respos);
8444 if (x==enc_EXCEPTION) {
8445 return -1;
8446 }
8447 else if (x==enc_FAILED) {
8448 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8449 return -1;
8450 }
8451 }
8452 /* fall through */
8453 case _Py_ERROR_IGNORE:
8454 *inpos = collendpos;
8455 break;
8456
8457 case _Py_ERROR_XMLCHARREFREPLACE:
8458 /* generate replacement (temporarily (mis)uses p) */
8459 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8460 char buffer[2+29+1+1];
8461 char *cp;
8462 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8463 for (cp = buffer; *cp; ++cp) {
8464 x = charmapencode_output(*cp, mapping, res, respos);
8465 if (x==enc_EXCEPTION)
8466 return -1;
8467 else if (x==enc_FAILED) {
8468 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8469 return -1;
8470 }
8471 }
8472 }
8473 *inpos = collendpos;
8474 break;
8475
8476 default:
8477 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8478 encoding, reason, unicode, exceptionObject,
8479 collstartpos, collendpos, &newpos);
8480 if (repunicode == NULL)
8481 return -1;
8482 if (PyBytes_Check(repunicode)) {
8483 /* Directly copy bytes result to output. */
8484 Py_ssize_t outsize = PyBytes_Size(*res);
8485 Py_ssize_t requiredsize;
8486 repsize = PyBytes_Size(repunicode);
8487 requiredsize = *respos + repsize;
8488 if (requiredsize > outsize)
8489 /* Make room for all additional bytes. */
8490 if (charmapencode_resize(res, respos, requiredsize)) {
8491 Py_DECREF(repunicode);
8492 return -1;
8493 }
8494 memcpy(PyBytes_AsString(*res) + *respos,
8495 PyBytes_AsString(repunicode), repsize);
8496 *respos += repsize;
8497 *inpos = newpos;
8498 Py_DECREF(repunicode);
8499 break;
8500 }
8501 /* generate replacement */
8502 if (PyUnicode_READY(repunicode) == -1) {
8503 Py_DECREF(repunicode);
8504 return -1;
8505 }
8506 repsize = PyUnicode_GET_LENGTH(repunicode);
8507 data = PyUnicode_DATA(repunicode);
8508 kind = PyUnicode_KIND(repunicode);
8509 for (index = 0; index < repsize; index++) {
8510 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8511 x = charmapencode_output(repch, mapping, res, respos);
8512 if (x==enc_EXCEPTION) {
8513 Py_DECREF(repunicode);
8514 return -1;
8515 }
8516 else if (x==enc_FAILED) {
8517 Py_DECREF(repunicode);
8518 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8519 return -1;
8520 }
8521 }
8522 *inpos = newpos;
8523 Py_DECREF(repunicode);
8524 }
8525 return 0;
8526}
8527
8528PyObject *
8529_PyUnicode_EncodeCharmap(PyObject *unicode,
8530 PyObject *mapping,
8531 const char *errors)
8532{
8533 /* output object */
8534 PyObject *res = NULL;
8535 /* current input position */
8536 Py_ssize_t inpos = 0;
8537 Py_ssize_t size;
8538 /* current output position */
8539 Py_ssize_t respos = 0;
8540 PyObject *error_handler_obj = NULL;
8541 PyObject *exc = NULL;
8542 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8543 void *data;
8544 int kind;
8545
8546 if (PyUnicode_READY(unicode) == -1)
8547 return NULL;
8548 size = PyUnicode_GET_LENGTH(unicode);
8549 data = PyUnicode_DATA(unicode);
8550 kind = PyUnicode_KIND(unicode);
8551
8552 /* Default to Latin-1 */
8553 if (mapping == NULL)
8554 return unicode_encode_ucs1(unicode, errors, 256);
8555
8556 /* allocate enough for a simple encoding without
8557 replacements, if we need more, we'll resize */
8558 res = PyBytes_FromStringAndSize(NULL, size);
8559 if (res == NULL)
8560 goto onError;
8561 if (size == 0)
8562 return res;
8563
8564 while (inpos<size) {
8565 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8566 /* try to encode it */
8567 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8568 if (x==enc_EXCEPTION) /* error */
8569 goto onError;
8570 if (x==enc_FAILED) { /* unencodable character */
8571 if (charmap_encoding_error(unicode, &inpos, mapping,
8572 &exc,
8573 &error_handler, &error_handler_obj, errors,
8574 &res, &respos)) {
8575 goto onError;
8576 }
8577 }
8578 else
8579 /* done with this character => adjust input position */
8580 ++inpos;
8581 }
8582
8583 /* Resize if we allocated to much */
8584 if (respos<PyBytes_GET_SIZE(res))
8585 if (_PyBytes_Resize(&res, respos) < 0)
8586 goto onError;
8587
8588 Py_XDECREF(exc);
8589 Py_XDECREF(error_handler_obj);
8590 return res;
8591
8592 onError:
8593 Py_XDECREF(res);
8594 Py_XDECREF(exc);
8595 Py_XDECREF(error_handler_obj);
8596 return NULL;
8597}
8598
8599/* Deprecated */
8600PyObject *
8601PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8602 Py_ssize_t size,
8603 PyObject *mapping,
8604 const char *errors)
8605{
8606 PyObject *result;
8607 PyObject *unicode = PyUnicode_FromWideChar(p, size);
8608 if (unicode == NULL)
8609 return NULL;
8610 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8611 Py_DECREF(unicode);
8612 return result;
8613}
8614
8615PyObject *
8616PyUnicode_AsCharmapString(PyObject *unicode,
8617 PyObject *mapping)
8618{
8619 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8620 PyErr_BadArgument();
8621 return NULL;
8622 }
8623 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8624}
8625
8626/* create or adjust a UnicodeTranslateError */
8627static void
8628make_translate_exception(PyObject **exceptionObject,
8629 PyObject *unicode,
8630 Py_ssize_t startpos, Py_ssize_t endpos,
8631 const char *reason)
8632{
8633 if (*exceptionObject == NULL) {
8634 *exceptionObject = _PyUnicodeTranslateError_Create(
8635 unicode, startpos, endpos, reason);
8636 }
8637 else {
8638 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8639 goto onError;
8640 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8641 goto onError;
8642 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8643 goto onError;
8644 return;
8645 onError:
8646 Py_CLEAR(*exceptionObject);
8647 }
8648}
8649
8650/* error handling callback helper:
8651 build arguments, call the callback and check the arguments,
8652 put the result into newpos and return the replacement string, which
8653 has to be freed by the caller */
8654static PyObject *
8655unicode_translate_call_errorhandler(const char *errors,
8656 PyObject **errorHandler,
8657 const char *reason,
8658 PyObject *unicode, PyObject **exceptionObject,
8659 Py_ssize_t startpos, Py_ssize_t endpos,
8660 Py_ssize_t *newpos)
8661{
8662 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8663
8664 Py_ssize_t i_newpos;
8665 PyObject *restuple;
8666 PyObject *resunicode;
8667
8668 if (*errorHandler == NULL) {
8669 *errorHandler = PyCodec_LookupError(errors);
8670 if (*errorHandler == NULL)
8671 return NULL;
8672 }
8673
8674 make_translate_exception(exceptionObject,
8675 unicode, startpos, endpos, reason);
8676 if (*exceptionObject == NULL)
8677 return NULL;
8678
8679 restuple = PyObject_CallFunctionObjArgs(
8680 *errorHandler, *exceptionObject, NULL);
8681 if (restuple == NULL)
8682 return NULL;
8683 if (!PyTuple_Check(restuple)) {
8684 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8685 Py_DECREF(restuple);
8686 return NULL;
8687 }
8688 if (!PyArg_ParseTuple(restuple, argparse,
8689 &resunicode, &i_newpos)) {
8690 Py_DECREF(restuple);
8691 return NULL;
8692 }
8693 if (i_newpos<0)
8694 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8695 else
8696 *newpos = i_newpos;
8697 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8698 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8699 Py_DECREF(restuple);
8700 return NULL;
8701 }
8702 Py_INCREF(resunicode);
8703 Py_DECREF(restuple);
8704 return resunicode;
8705}
8706
8707/* Lookup the character ch in the mapping and put the result in result,
8708 which must be decrefed by the caller.
8709 Return 0 on success, -1 on error */
8710static int
8711charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8712{
8713 PyObject *w = PyLong_FromLong((long)c);
8714 PyObject *x;
8715
8716 if (w == NULL)
8717 return -1;
8718 x = PyObject_GetItem(mapping, w);
8719 Py_DECREF(w);
8720 if (x == NULL) {
8721 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8722 /* No mapping found means: use 1:1 mapping. */
8723 PyErr_Clear();
8724 *result = NULL;
8725 return 0;
8726 } else
8727 return -1;
8728 }
8729 else if (x == Py_None) {
8730 *result = x;
8731 return 0;
8732 }
8733 else if (PyLong_Check(x)) {
8734 long value = PyLong_AS_LONG(x);
8735 if (value < 0 || value > MAX_UNICODE) {
8736 PyErr_Format(PyExc_ValueError,
8737 "character mapping must be in range(0x%x)",
8738 MAX_UNICODE+1);
8739 Py_DECREF(x);
8740 return -1;
8741 }
8742 *result = x;
8743 return 0;
8744 }
8745 else if (PyUnicode_Check(x)) {
8746 *result = x;
8747 return 0;
8748 }
8749 else {
8750 /* wrong return value */
8751 PyErr_SetString(PyExc_TypeError,
8752 "character mapping must return integer, None or str");
8753 Py_DECREF(x);
8754 return -1;
8755 }
8756}
8757
8758/* lookup the character, write the result into the writer.
8759 Return 1 if the result was written into the writer, return 0 if the mapping
8760 was undefined, raise an exception return -1 on error. */
8761static int
8762charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8763 _PyUnicodeWriter *writer)
8764{
8765 PyObject *item;
8766
8767 if (charmaptranslate_lookup(ch, mapping, &item))
8768 return -1;
8769
8770 if (item == NULL) {
8771 /* not found => default to 1:1 mapping */
8772 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8773 return -1;
8774 }
8775 return 1;
8776 }
8777
8778 if (item == Py_None) {
8779 Py_DECREF(item);
8780 return 0;
8781 }
8782
8783 if (PyLong_Check(item)) {
8784 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8785 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8786 used it */
8787 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8788 Py_DECREF(item);
8789 return -1;
8790 }
8791 Py_DECREF(item);
8792 return 1;
8793 }
8794
8795 if (!PyUnicode_Check(item)) {
8796 Py_DECREF(item);
8797 return -1;
8798 }
8799
8800 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8801 Py_DECREF(item);
8802 return -1;
8803 }
8804
8805 Py_DECREF(item);
8806 return 1;
8807}
8808
8809static int
8810unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8811 Py_UCS1 *translate)
8812{
8813 PyObject *item = NULL;
8814 int ret = 0;
8815
8816 if (charmaptranslate_lookup(ch, mapping, &item)) {
8817 return -1;
8818 }
8819
8820 if (item == Py_None) {
8821 /* deletion */
8822 translate[ch] = 0xfe;
8823 }
8824 else if (item == NULL) {
8825 /* not found => default to 1:1 mapping */
8826 translate[ch] = ch;
8827 return 1;
8828 }
8829 else if (PyLong_Check(item)) {
8830 long replace = PyLong_AS_LONG(item);
8831 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8832 used it */
8833 if (127 < replace) {
8834 /* invalid character or character outside ASCII:
8835 skip the fast translate */
8836 goto exit;
8837 }
8838 translate[ch] = (Py_UCS1)replace;
8839 }
8840 else if (PyUnicode_Check(item)) {
8841 Py_UCS4 replace;
8842
8843 if (PyUnicode_READY(item) == -1) {
8844 Py_DECREF(item);
8845 return -1;
8846 }
8847 if (PyUnicode_GET_LENGTH(item) != 1)
8848 goto exit;
8849
8850 replace = PyUnicode_READ_CHAR(item, 0);
8851 if (replace > 127)
8852 goto exit;
8853 translate[ch] = (Py_UCS1)replace;
8854 }
8855 else {
8856 /* not None, NULL, long or unicode */
8857 goto exit;
8858 }
8859 ret = 1;
8860
8861 exit:
8862 Py_DECREF(item);
8863 return ret;
8864}
8865
8866/* Fast path for ascii => ascii translation. Return 1 if the whole string
8867 was translated into writer, return 0 if the input string was partially
8868 translated into writer, raise an exception and return -1 on error. */
8869static int
8870unicode_fast_translate(PyObject *input, PyObject *mapping,
8871 _PyUnicodeWriter *writer, int ignore,
8872 Py_ssize_t *input_pos)
8873{
8874 Py_UCS1 ascii_table[128], ch, ch2;
8875 Py_ssize_t len;
8876 Py_UCS1 *in, *end, *out;
8877 int res = 0;
8878
8879 len = PyUnicode_GET_LENGTH(input);
8880
8881 memset(ascii_table, 0xff, 128);
8882
8883 in = PyUnicode_1BYTE_DATA(input);
8884 end = in + len;
8885
8886 assert(PyUnicode_IS_ASCII(writer->buffer));
8887 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8888 out = PyUnicode_1BYTE_DATA(writer->buffer);
8889
8890 for (; in < end; in++) {
8891 ch = *in;
8892 ch2 = ascii_table[ch];
8893 if (ch2 == 0xff) {
8894 int translate = unicode_fast_translate_lookup(mapping, ch,
8895 ascii_table);
8896 if (translate < 0)
8897 return -1;
8898 if (translate == 0)
8899 goto exit;
8900 ch2 = ascii_table[ch];
8901 }
8902 if (ch2 == 0xfe) {
8903 if (ignore)
8904 continue;
8905 goto exit;
8906 }
8907 assert(ch2 < 128);
8908 *out = ch2;
8909 out++;
8910 }
8911 res = 1;
8912
8913exit:
8914 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8915 *input_pos = in - PyUnicode_1BYTE_DATA(input);
8916 return res;
8917}
8918
8919static PyObject *
8920_PyUnicode_TranslateCharmap(PyObject *input,
8921 PyObject *mapping,
8922 const char *errors)
8923{
8924 /* input object */
8925 char *data;
8926 Py_ssize_t size, i;
8927 int kind;
8928 /* output buffer */
8929 _PyUnicodeWriter writer;
8930 /* error handler */
8931 char *reason = "character maps to <undefined>";
8932 PyObject *errorHandler = NULL;
8933 PyObject *exc = NULL;
8934 int ignore;
8935 int res;
8936
8937 if (mapping == NULL) {
8938 PyErr_BadArgument();
8939 return NULL;
8940 }
8941
8942 if (PyUnicode_READY(input) == -1)
8943 return NULL;
8944 data = (char*)PyUnicode_DATA(input);
8945 kind = PyUnicode_KIND(input);
8946 size = PyUnicode_GET_LENGTH(input);
8947
8948 if (size == 0)
8949 return PyUnicode_FromObject(input);
8950
8951 /* allocate enough for a simple 1:1 translation without
8952 replacements, if we need more, we'll resize */
8953 _PyUnicodeWriter_Init(&writer);
8954 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8955 goto onError;
8956
8957 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8958
8959 if (PyUnicode_READY(input) == -1)
8960 return NULL;
8961 if (PyUnicode_IS_ASCII(input)) {
8962 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8963 if (res < 0) {
8964 _PyUnicodeWriter_Dealloc(&writer);
8965 return NULL;
8966 }
8967 if (res == 1)
8968 return _PyUnicodeWriter_Finish(&writer);
8969 }
8970 else {
8971 i = 0;
8972 }
8973
8974 while (i<size) {
8975 /* try to encode it */
8976 int translate;
8977 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8978 Py_ssize_t newpos;
8979 /* startpos for collecting untranslatable chars */
8980 Py_ssize_t collstart;
8981 Py_ssize_t collend;
8982 Py_UCS4 ch;
8983
8984 ch = PyUnicode_READ(kind, data, i);
8985 translate = charmaptranslate_output(ch, mapping, &writer);
8986 if (translate < 0)
8987 goto onError;
8988
8989 if (translate != 0) {
8990 /* it worked => adjust input pointer */
8991 ++i;
8992 continue;
8993 }
8994
8995 /* untranslatable character */
8996 collstart = i;
8997 collend = i+1;
8998
8999 /* find all untranslatable characters */
9000 while (collend < size) {
9001 PyObject *x;
9002 ch = PyUnicode_READ(kind, data, collend);
9003 if (charmaptranslate_lookup(ch, mapping, &x))
9004 goto onError;
9005 Py_XDECREF(x);
9006 if (x != Py_None)
9007 break;
9008 ++collend;
9009 }
9010
9011 if (ignore) {
9012 i = collend;
9013 }
9014 else {
9015 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9016 reason, input, &exc,
9017 collstart, collend, &newpos);
9018 if (repunicode == NULL)
9019 goto onError;
9020 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9021 Py_DECREF(repunicode);
9022 goto onError;
9023 }
9024 Py_DECREF(repunicode);
9025 i = newpos;
9026 }
9027 }
9028 Py_XDECREF(exc);
9029 Py_XDECREF(errorHandler);
9030 return _PyUnicodeWriter_Finish(&writer);
9031
9032 onError:
9033 _PyUnicodeWriter_Dealloc(&writer);
9034 Py_XDECREF(exc);
9035 Py_XDECREF(errorHandler);
9036 return NULL;
9037}
9038
9039/* Deprecated. Use PyUnicode_Translate instead. */
9040PyObject *
9041PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9042 Py_ssize_t size,
9043 PyObject *mapping,
9044 const char *errors)
9045{
9046 PyObject *result;
9047 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9048 if (!unicode)
9049 return NULL;
9050 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9051 Py_DECREF(unicode);
9052 return result;
9053}
9054
9055PyObject *
9056PyUnicode_Translate(PyObject *str,
9057 PyObject *mapping,
9058 const char *errors)
9059{
9060 if (ensure_unicode(str) < 0)
9061 return NULL;
9062 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9063}
9064
9065static Py_UCS4
9066fix_decimal_and_space_to_ascii(PyObject *self)
9067{
9068 /* No need to call PyUnicode_READY(self) because this function is only
9069 called as a callback from fixup() which does it already. */
9070 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9071 const int kind = PyUnicode_KIND(self);
9072 void *data = PyUnicode_DATA(self);
9073 Py_UCS4 maxchar = 127, ch, fixed;
9074 int modified = 0;
9075 Py_ssize_t i;
9076
9077 for (i = 0; i < len; ++i) {
9078 ch = PyUnicode_READ(kind, data, i);
9079 fixed = 0;
9080 if (ch > 127) {
9081 if (Py_UNICODE_ISSPACE(ch))
9082 fixed = ' ';
9083 else {
9084 const int decimal = Py_UNICODE_TODECIMAL(ch);
9085 if (decimal >= 0)
9086 fixed = '0' + decimal;
9087 }
9088 if (fixed != 0) {
9089 modified = 1;
9090 maxchar = Py_MAX(maxchar, fixed);
9091 PyUnicode_WRITE(kind, data, i, fixed);
9092 }
9093 else
9094 maxchar = Py_MAX(maxchar, ch);
9095 }
9096 }
9097
9098 return (modified) ? maxchar : 0;
9099}
9100
9101PyObject *
9102_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9103{
9104 if (!PyUnicode_Check(unicode)) {
9105 PyErr_BadInternalCall();
9106 return NULL;
9107 }
9108 if (PyUnicode_READY(unicode) == -1)
9109 return NULL;
9110 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9111 /* If the string is already ASCII, just return the same string */
9112 Py_INCREF(unicode);
9113 return unicode;
9114 }
9115 return fixup(unicode, fix_decimal_and_space_to_ascii);
9116}
9117
9118PyObject *
9119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9120 Py_ssize_t length)
9121{
9122 PyObject *decimal;
9123 Py_ssize_t i;
9124 Py_UCS4 maxchar;
9125 enum PyUnicode_Kind kind;
9126 void *data;
9127
9128 maxchar = 127;
9129 for (i = 0; i < length; i++) {
9130 Py_UCS4 ch = s[i];
9131 if (ch > 127) {
9132 int decimal = Py_UNICODE_TODECIMAL(ch);
9133 if (decimal >= 0)
9134 ch = '0' + decimal;
9135 maxchar = Py_MAX(maxchar, ch);
9136 }
9137 }
9138
9139 /* Copy to a new string */
9140 decimal = PyUnicode_New(length, maxchar);
9141 if (decimal == NULL)
9142 return decimal;
9143 kind = PyUnicode_KIND(decimal);
9144 data = PyUnicode_DATA(decimal);
9145 /* Iterate over code points */
9146 for (i = 0; i < length; i++) {
9147 Py_UCS4 ch = s[i];
9148 if (ch > 127) {
9149 int decimal = Py_UNICODE_TODECIMAL(ch);
9150 if (decimal >= 0)
9151 ch = '0' + decimal;
9152 }
9153 PyUnicode_WRITE(kind, data, i, ch);
9154 }
9155 return unicode_result(decimal);
9156}
9157/* --- Decimal Encoder ---------------------------------------------------- */
9158
9159int
9160PyUnicode_EncodeDecimal(Py_UNICODE *s,
9161 Py_ssize_t length,
9162 char *output,
9163 const char *errors)
9164{
9165 PyObject *unicode;
9166 Py_ssize_t i;
9167 enum PyUnicode_Kind kind;
9168 void *data;
9169
9170 if (output == NULL) {
9171 PyErr_BadArgument();
9172 return -1;
9173 }
9174
9175 unicode = PyUnicode_FromWideChar(s, length);
9176 if (unicode == NULL)
9177 return -1;
9178
9179 kind = PyUnicode_KIND(unicode);
9180 data = PyUnicode_DATA(unicode);
9181
9182 for (i=0; i < length; ) {
9183 PyObject *exc;
9184 Py_UCS4 ch;
9185 int decimal;
9186 Py_ssize_t startpos;
9187
9188 ch = PyUnicode_READ(kind, data, i);
9189
9190 if (Py_UNICODE_ISSPACE(ch)) {
9191 *output++ = ' ';
9192 i++;
9193 continue;
9194 }
9195 decimal = Py_UNICODE_TODECIMAL(ch);
9196 if (decimal >= 0) {
9197 *output++ = '0' + decimal;
9198 i++;
9199 continue;
9200 }
9201 if (0 < ch && ch < 256) {
9202 *output++ = (char)ch;
9203 i++;
9204 continue;
9205 }
9206
9207 startpos = i;
9208 exc = NULL;
9209 raise_encode_exception(&exc, "decimal", unicode,
9210 startpos, startpos+1,
9211 "invalid decimal Unicode string");
9212 Py_XDECREF(exc);
9213 Py_DECREF(unicode);
9214 return -1;
9215 }
9216 /* 0-terminate the output string */
9217 *output++ = '\0';
9218 Py_DECREF(unicode);
9219 return 0;
9220}
9221
9222/* --- Helpers ------------------------------------------------------------ */
9223
/* Helper macro to fix up start/end slice values for a sequence of length
   len: end is clipped to len, negative values are taken relative to len
   and then clipped to 0.  start and end must be modifiable lvalues.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. in an unbraced if/else); it must be followed by
   a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9238
9239static Py_ssize_t
9240any_find_slice(PyObject* s1, PyObject* s2,
9241 Py_ssize_t start,
9242 Py_ssize_t end,
9243 int direction)
9244{
9245 int kind1, kind2;
9246 void *buf1, *buf2;
9247 Py_ssize_t len1, len2, result;
9248
9249 kind1 = PyUnicode_KIND(s1);
9250 kind2 = PyUnicode_KIND(s2);
9251 if (kind1 < kind2)
9252 return -1;
9253
9254 len1 = PyUnicode_GET_LENGTH(s1);
9255 len2 = PyUnicode_GET_LENGTH(s2);
9256 ADJUST_INDICES(start, end, len1);
9257 if (end - start < len2)
9258 return -1;
9259
9260 buf1 = PyUnicode_DATA(s1);
9261 buf2 = PyUnicode_DATA(s2);
9262 if (len2 == 1) {
9263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9264 result = findchar((const char *)buf1 + kind1*start,
9265 kind1, end - start, ch, direction);
9266 if (result == -1)
9267 return -1;
9268 else
9269 return start + result;
9270 }
9271
9272 if (kind2 != kind1) {
9273 buf2 = _PyUnicode_AsKind(s2, kind1);
9274 if (!buf2)
9275 return -2;
9276 }
9277
9278 if (direction > 0) {
9279 switch (kind1) {
9280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
9293 assert(0); result = -2;
9294 }
9295 }
9296 else {
9297 switch (kind1) {
9298 case PyUnicode_1BYTE_KIND:
9299 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9300 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 else
9302 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 break;
9304 case PyUnicode_2BYTE_KIND:
9305 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_4BYTE_KIND:
9308 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 default:
9311 assert(0); result = -2;
9312 }
9313 }
9314
9315 if (kind2 != kind1)
9316 PyMem_Free(buf2);
9317
9318 return result;
9319}
9320
9321Py_ssize_t
9322_PyUnicode_InsertThousandsGrouping(
9323 PyObject *unicode, Py_ssize_t index,
9324 Py_ssize_t n_buffer,
9325 void *digits, Py_ssize_t n_digits,
9326 Py_ssize_t min_width,
9327 const char *grouping, PyObject *thousands_sep,
9328 Py_UCS4 *maxchar)
9329{
9330 unsigned int kind, thousands_sep_kind;
9331 char *data, *thousands_sep_data;
9332 Py_ssize_t thousands_sep_len;
9333 Py_ssize_t len;
9334
9335 if (unicode != NULL) {
9336 kind = PyUnicode_KIND(unicode);
9337 data = (char *) PyUnicode_DATA(unicode) + index * kind;
9338 }
9339 else {
9340 kind = PyUnicode_1BYTE_KIND;
9341 data = NULL;
9342 }
9343 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9344 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9345 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9346 if (unicode != NULL && thousands_sep_kind != kind) {
9347 if (thousands_sep_kind < kind) {
9348 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9349 if (!thousands_sep_data)
9350 return -1;
9351 }
9352 else {
9353 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9354 if (!data)
9355 return -1;
9356 }
9357 }
9358
9359 switch (kind) {
9360 case PyUnicode_1BYTE_KIND:
9361 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9362 len = asciilib_InsertThousandsGrouping(
9363 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9364 min_width, grouping,
9365 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9366 else
9367 len = ucs1lib_InsertThousandsGrouping(
9368 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9369 min_width, grouping,
9370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9371 break;
9372 case PyUnicode_2BYTE_KIND:
9373 len = ucs2lib_InsertThousandsGrouping(
9374 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9375 min_width, grouping,
9376 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9377 break;
9378 case PyUnicode_4BYTE_KIND:
9379 len = ucs4lib_InsertThousandsGrouping(
9380 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9381 min_width, grouping,
9382 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9383 break;
9384 default:
9385 assert(0);
9386 return -1;
9387 }
9388 if (unicode != NULL && thousands_sep_kind != kind) {
9389 if (thousands_sep_kind < kind)
9390 PyMem_Free(thousands_sep_data);
9391 else
9392 PyMem_Free(data);
9393 }
9394 if (unicode == NULL) {
9395 *maxchar = 127;
9396 if (len != n_digits) {
9397 *maxchar = Py_MAX(*maxchar,
9398 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9399 }
9400 }
9401 return len;
9402}
9403
9404
9405Py_ssize_t
9406PyUnicode_Count(PyObject *str,
9407 PyObject *substr,
9408 Py_ssize_t start,
9409 Py_ssize_t end)
9410{
9411 Py_ssize_t result;
9412 int kind1, kind2;
9413 void *buf1 = NULL, *buf2 = NULL;
9414 Py_ssize_t len1, len2;
9415
9416 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9417 return -1;
9418
9419 kind1 = PyUnicode_KIND(str);
9420 kind2 = PyUnicode_KIND(substr);
9421 if (kind1 < kind2)
9422 return 0;
9423
9424 len1 = PyUnicode_GET_LENGTH(str);
9425 len2 = PyUnicode_GET_LENGTH(substr);
9426 ADJUST_INDICES(start, end, len1);
9427 if (end - start < len2)
9428 return 0;
9429
9430 buf1 = PyUnicode_DATA(str);
9431 buf2 = PyUnicode_DATA(substr);
9432 if (kind2 != kind1) {
9433 buf2 = _PyUnicode_AsKind(substr, kind1);
9434 if (!buf2)
9435 goto onError;
9436 }
9437
9438 switch (kind1) {
9439 case PyUnicode_1BYTE_KIND:
9440 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9441 result = asciilib_count(
9442 ((Py_UCS1*)buf1) + start, end - start,
9443 buf2, len2, PY_SSIZE_T_MAX
9444 );
9445 else
9446 result = ucs1lib_count(
9447 ((Py_UCS1*)buf1) + start, end - start,
9448 buf2, len2, PY_SSIZE_T_MAX
9449 );
9450 break;
9451 case PyUnicode_2BYTE_KIND:
9452 result = ucs2lib_count(
9453 ((Py_UCS2*)buf1) + start, end - start,
9454 buf2, len2, PY_SSIZE_T_MAX
9455 );
9456 break;
9457 case PyUnicode_4BYTE_KIND:
9458 result = ucs4lib_count(
9459 ((Py_UCS4*)buf1) + start, end - start,
9460 buf2, len2, PY_SSIZE_T_MAX
9461 );
9462 break;
9463 default:
9464 assert(0); result = 0;
9465 }
9466
9467 if (kind2 != kind1)
9468 PyMem_Free(buf2);
9469
9470 return result;
9471 onError:
9472 if (kind2 != kind1 && buf2)
9473 PyMem_Free(buf2);
9474 return -1;
9475}
9476
9477Py_ssize_t
9478PyUnicode_Find(PyObject *str,
9479 PyObject *substr,
9480 Py_ssize_t start,
9481 Py_ssize_t end,
9482 int direction)
9483{
9484 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9485 return -2;
9486
9487 return any_find_slice(str, substr, start, end, direction);
9488}
9489
9490Py_ssize_t
9491PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9492 Py_ssize_t start, Py_ssize_t end,
9493 int direction)
9494{
9495 int kind;
9496 Py_ssize_t len, result;
9497 if (PyUnicode_READY(str) == -1)
9498 return -2;
9499 len = PyUnicode_GET_LENGTH(str);
9500 ADJUST_INDICES(start, end, len);
9501 if (end - start < 1)
9502 return -1;
9503 kind = PyUnicode_KIND(str);
9504 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9505 kind, end-start, ch, direction);
9506 if (result == -1)
9507 return -1;
9508 else
9509 return start + result;
9510}
9511
9512static int
9513tailmatch(PyObject *self,
9514 PyObject *substring,
9515 Py_ssize_t start,
9516 Py_ssize_t end,
9517 int direction)
9518{
9519 int kind_self;
9520 int kind_sub;
9521 void *data_self;
9522 void *data_sub;
9523 Py_ssize_t offset;
9524 Py_ssize_t i;
9525 Py_ssize_t end_sub;
9526
9527 if (PyUnicode_READY(self) == -1 ||
9528 PyUnicode_READY(substring) == -1)
9529 return -1;
9530
9531 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9532 end -= PyUnicode_GET_LENGTH(substring);
9533 if (end < start)
9534 return 0;
9535
9536 if (PyUnicode_GET_LENGTH(substring) == 0)
9537 return 1;
9538
9539 kind_self = PyUnicode_KIND(self);
9540 data_self = PyUnicode_DATA(self);
9541 kind_sub = PyUnicode_KIND(substring);
9542 data_sub = PyUnicode_DATA(substring);
9543 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9544
9545 if (direction > 0)
9546 offset = end;
9547 else
9548 offset = start;
9549
9550 if (PyUnicode_READ(kind_self, data_self, offset) ==
9551 PyUnicode_READ(kind_sub, data_sub, 0) &&
9552 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9553 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9554 /* If both are of the same kind, memcmp is sufficient */
9555 if (kind_self == kind_sub) {
9556 return ! memcmp((char *)data_self +
9557 (offset * PyUnicode_KIND(substring)),
9558 data_sub,
9559 PyUnicode_GET_LENGTH(substring) *
9560 PyUnicode_KIND(substring));
9561 }
9562 /* otherwise we have to compare each character by first accessing it */
9563 else {
9564 /* We do not need to compare 0 and len(substring)-1 because
9565 the if statement above ensured already that they are equal
9566 when we end up here. */
9567 for (i = 1; i < end_sub; ++i) {
9568 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9569 PyUnicode_READ(kind_sub, data_sub, i))
9570 return 0;
9571 }
9572 return 1;
9573 }
9574 }
9575
9576 return 0;
9577}
9578
9579Py_ssize_t
9580PyUnicode_Tailmatch(PyObject *str,
9581 PyObject *substr,
9582 Py_ssize_t start,
9583 Py_ssize_t end,
9584 int direction)
9585{
9586 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9587 return -1;
9588
9589 return tailmatch(str, substr, start, end, direction);
9590}
9591
9592/* Apply fixfct filter to the Unicode object self and return a
9593 reference to the modified object */
9594
9595static PyObject *
9596fixup(PyObject *self,
9597 Py_UCS4 (*fixfct)(PyObject *s))
9598{
9599 PyObject *u;
9600 Py_UCS4 maxchar_old, maxchar_new = 0;
9601 PyObject *v;
9602
9603 u = _PyUnicode_Copy(self);
9604 if (u == NULL)
9605 return NULL;
9606 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9607
9608 /* fix functions return the new maximum character in a string,
9609 if the kind of the resulting unicode object does not change,
9610 everything is fine. Otherwise we need to change the string kind
9611 and re-run the fix function. */
9612 maxchar_new = fixfct(u);
9613
9614 if (maxchar_new == 0) {
9615 /* no changes */;
9616 if (PyUnicode_CheckExact(self)) {
9617 Py_DECREF(u);
9618 Py_INCREF(self);
9619 return self;
9620 }
9621 else
9622 return u;
9623 }
9624
9625 maxchar_new = align_maxchar(maxchar_new);
9626
9627 if (maxchar_new == maxchar_old)
9628 return u;
9629
9630 /* In case the maximum character changed, we need to
9631 convert the string to the new category. */
9632 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9633 if (v == NULL) {
9634 Py_DECREF(u);
9635 return NULL;
9636 }
9637 if (maxchar_new > maxchar_old) {
9638 /* If the maxchar increased so that the kind changed, not all
9639 characters are representable anymore and we need to fix the
9640 string again. This only happens in very few cases. */
9641 _PyUnicode_FastCopyCharacters(v, 0,
9642 self, 0, PyUnicode_GET_LENGTH(self));
9643 maxchar_old = fixfct(v);
9644 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9645 }
9646 else {
9647 _PyUnicode_FastCopyCharacters(v, 0,
9648 u, 0, PyUnicode_GET_LENGTH(self));
9649 }
9650 Py_DECREF(u);
9651 assert(_PyUnicode_CheckConsistency(v, 1));
9652 return v;
9653}
9654
9655static PyObject *
9656ascii_upper_or_lower(PyObject *self, int lower)
9657{
9658 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9659 char *resdata, *data = PyUnicode_DATA(self);
9660 PyObject *res;
9661
9662 res = PyUnicode_New(len, 127);
9663 if (res == NULL)
9664 return NULL;
9665 resdata = PyUnicode_DATA(res);
9666 if (lower)
9667 _Py_bytes_lower(resdata, data, len);
9668 else
9669 _Py_bytes_upper(resdata, data, len);
9670 return res;
9671}
9672
9673static Py_UCS4
9674handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9675{
9676 Py_ssize_t j;
9677 int final_sigma;
9678 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9679 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9680
9681 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9682
9683 where ! is a negation and \p{xxx} is a character with property xxx.
9684 */
9685 for (j = i - 1; j >= 0; j--) {
9686 c = PyUnicode_READ(kind, data, j);
9687 if (!_PyUnicode_IsCaseIgnorable(c))
9688 break;
9689 }
9690 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9691 if (final_sigma) {
9692 for (j = i + 1; j < length; j++) {
9693 c = PyUnicode_READ(kind, data, j);
9694 if (!_PyUnicode_IsCaseIgnorable(c))
9695 break;
9696 }
9697 final_sigma = j == length || !_PyUnicode_IsCased(c);
9698 }
9699 return (final_sigma) ? 0x3C2 : 0x3C3;
9700}
9701
9702static int
9703lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9704 Py_UCS4 c, Py_UCS4 *mapped)
9705{
9706 /* Obscure special case. */
9707 if (c == 0x3A3) {
9708 mapped[0] = handle_capital_sigma(kind, data, length, i);
9709 return 1;
9710 }
9711 return _PyUnicode_ToLowerFull(c, mapped);
9712}
9713
9714static Py_ssize_t
9715do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9716{
9717 Py_ssize_t i, k = 0;
9718 int n_res, j;
9719 Py_UCS4 c, mapped[3];
9720
9721 c = PyUnicode_READ(kind, data, 0);
9722 n_res = _PyUnicode_ToUpperFull(c, mapped);
9723 for (j = 0; j < n_res; j++) {
9724 *maxchar = Py_MAX(*maxchar, mapped[j]);
9725 res[k++] = mapped[j];
9726 }
9727 for (i = 1; i < length; i++) {
9728 c = PyUnicode_READ(kind, data, i);
9729 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9730 for (j = 0; j < n_res; j++) {
9731 *maxchar = Py_MAX(*maxchar, mapped[j]);
9732 res[k++] = mapped[j];
9733 }
9734 }
9735 return k;
9736}
9737
9738static Py_ssize_t
9739do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9740 Py_ssize_t i, k = 0;
9741
9742 for (i = 0; i < length; i++) {
9743 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9744 int n_res, j;
9745 if (Py_UNICODE_ISUPPER(c)) {
9746 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9747 }
9748 else if (Py_UNICODE_ISLOWER(c)) {
9749 n_res = _PyUnicode_ToUpperFull(c, mapped);
9750 }
9751 else {
9752 n_res = 1;
9753 mapped[0] = c;
9754 }
9755 for (j = 0; j < n_res; j++) {
9756 *maxchar = Py_MAX(*maxchar, mapped[j]);
9757 res[k++] = mapped[j];
9758 }
9759 }
9760 return k;
9761}
9762
9763static Py_ssize_t
9764do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9765 Py_UCS4 *maxchar, int lower)
9766{
9767 Py_ssize_t i, k = 0;
9768
9769 for (i = 0; i < length; i++) {
9770 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9771 int n_res, j;
9772 if (lower)
9773 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9774 else
9775 n_res = _PyUnicode_ToUpperFull(c, mapped);
9776 for (j = 0; j < n_res; j++) {
9777 *maxchar = Py_MAX(*maxchar, mapped[j]);
9778 res[k++] = mapped[j];
9779 }
9780 }
9781 return k;
9782}
9783
9784static Py_ssize_t
9785do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9786{
9787 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9788}
9789
9790static Py_ssize_t
9791do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792{
9793 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9794}
9795
9796static Py_ssize_t
9797do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9798{
9799 Py_ssize_t i, k = 0;
9800
9801 for (i = 0; i < length; i++) {
9802 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9803 Py_UCS4 mapped[3];
9804 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9805 for (j = 0; j < n_res; j++) {
9806 *maxchar = Py_MAX(*maxchar, mapped[j]);
9807 res[k++] = mapped[j];
9808 }
9809 }
9810 return k;
9811}
9812
9813static Py_ssize_t
9814do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9815{
9816 Py_ssize_t i, k = 0;
9817 int previous_is_cased;
9818
9819 previous_is_cased = 0;
9820 for (i = 0; i < length; i++) {
9821 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9822 Py_UCS4 mapped[3];
9823 int n_res, j;
9824
9825 if (previous_is_cased)
9826 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9827 else
9828 n_res = _PyUnicode_ToTitleFull(c, mapped);
9829
9830 for (j = 0; j < n_res; j++) {
9831 *maxchar = Py_MAX(*maxchar, mapped[j]);
9832 res[k++] = mapped[j];
9833 }
9834
9835 previous_is_cased = _PyUnicode_IsCased(c);
9836 }
9837 return k;
9838}
9839
9840static PyObject *
9841case_operation(PyObject *self,
9842 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9843{
9844 PyObject *res = NULL;
9845 Py_ssize_t length, newlength = 0;
9846 int kind, outkind;
9847 void *data, *outdata;
9848 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9849
9850 assert(PyUnicode_IS_READY(self));
9851
9852 kind = PyUnicode_KIND(self);
9853 data = PyUnicode_DATA(self);
9854 length = PyUnicode_GET_LENGTH(self);
9855 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9856 PyErr_SetString(PyExc_OverflowError, "string is too long");
9857 return NULL;
9858 }
9859 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9860 if (tmp == NULL)
9861 return PyErr_NoMemory();
9862 newlength = perform(kind, data, length, tmp, &maxchar);
9863 res = PyUnicode_New(newlength, maxchar);
9864 if (res == NULL)
9865 goto leave;
9866 tmpend = tmp + newlength;
9867 outdata = PyUnicode_DATA(res);
9868 outkind = PyUnicode_KIND(res);
9869 switch (outkind) {
9870 case PyUnicode_1BYTE_KIND:
9871 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9872 break;
9873 case PyUnicode_2BYTE_KIND:
9874 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9875 break;
9876 case PyUnicode_4BYTE_KIND:
9877 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9878 break;
9879 default:
9880 assert(0);
9881 break;
9882 }
9883 leave:
9884 PyMem_FREE(tmp);
9885 return res;
9886}
9887
9888PyObject *
9889PyUnicode_Join(PyObject *separator, PyObject *seq)
9890{
9891 PyObject *res;
9892 PyObject *fseq;
9893 Py_ssize_t seqlen;
9894 PyObject **items;
9895
9896 fseq = PySequence_Fast(seq, "can only join an iterable");
9897 if (fseq == NULL) {
9898 return NULL;
9899 }
9900
9901 /* NOTE: the following code can't call back into Python code,
9902 * so we are sure that fseq won't be mutated.
9903 */
9904
9905 items = PySequence_Fast_ITEMS(fseq);
9906 seqlen = PySequence_Fast_GET_SIZE(fseq);
9907 res = _PyUnicode_JoinArray(separator, items, seqlen);
9908 Py_DECREF(fseq);
9909 return res;
9910}
9911
9912PyObject *
9913_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9914{
9915 PyObject *res = NULL; /* the result */
9916 PyObject *sep = NULL;
9917 Py_ssize_t seplen;
9918 PyObject *item;
9919 Py_ssize_t sz, i, res_offset;
9920 Py_UCS4 maxchar;
9921 Py_UCS4 item_maxchar;
9922 int use_memcpy;
9923 unsigned char *res_data = NULL, *sep_data = NULL;
9924 PyObject *last_obj;
9925 unsigned int kind = 0;
9926
9927 /* If empty sequence, return u"". */
9928 if (seqlen == 0) {
9929 _Py_RETURN_UNICODE_EMPTY();
9930 }
9931
9932 /* If singleton sequence with an exact Unicode, return that. */
9933 last_obj = NULL;
9934 if (seqlen == 1) {
9935 if (PyUnicode_CheckExact(items[0])) {
9936 res = items[0];
9937 Py_INCREF(res);
9938 return res;
9939 }
9940 seplen = 0;
9941 maxchar = 0;
9942 }
9943 else {
9944 /* Set up sep and seplen */
9945 if (separator == NULL) {
9946 /* fall back to a blank space separator */
9947 sep = PyUnicode_FromOrdinal(' ');
9948 if (!sep)
9949 goto onError;
9950 seplen = 1;
9951 maxchar = 32;
9952 }
9953 else {
9954 if (!PyUnicode_Check(separator)) {
9955 PyErr_Format(PyExc_TypeError,
9956 "separator: expected str instance,"
9957 " %.80s found",
9958 Py_TYPE(separator)->tp_name);
9959 goto onError;
9960 }
9961 if (PyUnicode_READY(separator))
9962 goto onError;
9963 sep = separator;
9964 seplen = PyUnicode_GET_LENGTH(separator);
9965 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9966 /* inc refcount to keep this code path symmetric with the
9967 above case of a blank separator */
9968 Py_INCREF(sep);
9969 }
9970 last_obj = sep;
9971 }
9972
9973 /* There are at least two things to join, or else we have a subclass
9974 * of str in the sequence.
9975 * Do a pre-pass to figure out the total amount of space we'll
9976 * need (sz), and see whether all argument are strings.
9977 */
9978 sz = 0;
9979#ifdef Py_DEBUG
9980 use_memcpy = 0;
9981#else
9982 use_memcpy = 1;
9983#endif
9984 for (i = 0; i < seqlen; i++) {
9985 size_t add_sz;
9986 item = items[i];
9987 if (!PyUnicode_Check(item)) {
9988 PyErr_Format(PyExc_TypeError,
9989 "sequence item %zd: expected str instance,"
9990 " %.80s found",
9991 i, Py_TYPE(item)->tp_name);
9992 goto onError;
9993 }
9994 if (PyUnicode_READY(item) == -1)
9995 goto onError;
9996 add_sz = PyUnicode_GET_LENGTH(item);
9997 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9998 maxchar = Py_MAX(maxchar, item_maxchar);
9999 if (i != 0) {
10000 add_sz += seplen;
10001 }
10002 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10003 PyErr_SetString(PyExc_OverflowError,
10004 "join() result is too long for a Python string");
10005 goto onError;
10006 }
10007 sz += add_sz;
10008 if (use_memcpy && last_obj != NULL) {
10009 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10010 use_memcpy = 0;
10011 }
10012 last_obj = item;
10013 }
10014
10015 res = PyUnicode_New(sz, maxchar);
10016 if (res == NULL)
10017 goto onError;
10018
10019 /* Catenate everything. */
10020#ifdef Py_DEBUG
10021 use_memcpy = 0;
10022#else
10023 if (use_memcpy) {
10024 res_data = PyUnicode_1BYTE_DATA(res);
10025 kind = PyUnicode_KIND(res);
10026 if (seplen != 0)
10027 sep_data = PyUnicode_1BYTE_DATA(sep);
10028 }
10029#endif
10030 if (use_memcpy) {
10031 for (i = 0; i < seqlen; ++i) {
10032 Py_ssize_t itemlen;
10033 item = items[i];
10034
10035 /* Copy item, and maybe the separator. */
10036 if (i && seplen != 0) {
10037 memcpy(res_data,
10038 sep_data,
10039 kind * seplen);
10040 res_data += kind * seplen;
10041 }
10042
10043 itemlen = PyUnicode_GET_LENGTH(item);
10044 if (itemlen != 0) {
10045 memcpy(res_data,
10046 PyUnicode_DATA(item),
10047 kind * itemlen);
10048 res_data += kind * itemlen;
10049 }
10050 }
10051 assert(res_data == PyUnicode_1BYTE_DATA(res)
10052 + kind * PyUnicode_GET_LENGTH(res));
10053 }
10054 else {
10055 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10056 Py_ssize_t itemlen;
10057 item = items[i];
10058
10059 /* Copy item, and maybe the separator. */
10060 if (i && seplen != 0) {
10061 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10062 res_offset += seplen;
10063 }
10064
10065 itemlen = PyUnicode_GET_LENGTH(item);
10066 if (itemlen != 0) {
10067 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10068 res_offset += itemlen;
10069 }
10070 }
10071 assert(res_offset == PyUnicode_GET_LENGTH(res));
10072 }
10073
10074 Py_XDECREF(sep);
10075 assert(_PyUnicode_CheckConsistency(res, 1));
10076 return res;
10077
10078 onError:
10079 Py_XDECREF(sep);
10080 Py_XDECREF(res);
10081 return NULL;
10082}
10083
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width (1, 2 or 4 byte
   kind); the wchar kind is not allowed.  Every argument is parenthesized
   in the expansion so that expression arguments are handled correctly. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10107
10108void
10109_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10110 Py_UCS4 fill_char)
10111{
10112 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10113 const void *data = PyUnicode_DATA(unicode);
10114 assert(PyUnicode_IS_READY(unicode));
10115 assert(unicode_modifiable(unicode));
10116 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10117 assert(start >= 0);
10118 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10119 FILL(kind, data, fill_char, start, length);
10120}
10121
10122Py_ssize_t
10123PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10124 Py_UCS4 fill_char)
10125{
10126 Py_ssize_t maxlen;
10127
10128 if (!PyUnicode_Check(unicode)) {
10129 PyErr_BadInternalCall();
10130 return -1;
10131 }
10132 if (PyUnicode_READY(unicode) == -1)
10133 return -1;
10134 if (unicode_check_modifiable(unicode))
10135 return -1;
10136
10137 if (start < 0) {
10138 PyErr_SetString(PyExc_IndexError, "string index out of range");
10139 return -1;
10140 }
10141 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10142 PyErr_SetString(PyExc_ValueError,
10143 "fill character is bigger than "
10144 "the string maximum character");
10145 return -1;
10146 }
10147
10148 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10149 length = Py_MIN(maxlen, length);
10150 if (length <= 0)
10151 return 0;
10152
10153 _PyUnicode_FastFill(unicode, start, length, fill_char);
10154 return length;
10155}
10156
10157static PyObject *
10158pad(PyObject *self,
10159 Py_ssize_t left,
10160 Py_ssize_t right,
10161 Py_UCS4 fill)
10162{
10163 PyObject *u;
10164 Py_UCS4 maxchar;
10165 int kind;
10166 void *data;
10167
10168 if (left < 0)
10169 left = 0;
10170 if (right < 0)
10171 right = 0;
10172
10173 if (left == 0 && right == 0)
10174 return unicode_result_unchanged(self);
10175
10176 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10177 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10178 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10179 return NULL;
10180 }
10181 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10182 maxchar = Py_MAX(maxchar, fill);
10183 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10184 if (!u)
10185 return NULL;
10186
10187 kind = PyUnicode_KIND(u);
10188 data = PyUnicode_DATA(u);
10189 if (left)
10190 FILL(kind, data, fill, 0, left);
10191 if (right)
10192 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10193 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10194 assert(_PyUnicode_CheckConsistency(u, 1));
10195 return u;
10196}
10197
10198PyObject *
10199PyUnicode_Splitlines(PyObject *string, int keepends)
10200{
10201 PyObject *list;
10202
10203 if (ensure_unicode(string) < 0)
10204 return NULL;
10205
10206 switch (PyUnicode_KIND(string)) {
10207 case PyUnicode_1BYTE_KIND:
10208 if (PyUnicode_IS_ASCII(string))
10209 list = asciilib_splitlines(
10210 string, PyUnicode_1BYTE_DATA(string),
10211 PyUnicode_GET_LENGTH(string), keepends);
10212 else
10213 list = ucs1lib_splitlines(
10214 string, PyUnicode_1BYTE_DATA(string),
10215 PyUnicode_GET_LENGTH(string), keepends);
10216 break;
10217 case PyUnicode_2BYTE_KIND:
10218 list = ucs2lib_splitlines(
10219 string, PyUnicode_2BYTE_DATA(string),
10220 PyUnicode_GET_LENGTH(string), keepends);
10221 break;
10222 case PyUnicode_4BYTE_KIND:
10223 list = ucs4lib_splitlines(
10224 string, PyUnicode_4BYTE_DATA(string),
10225 PyUnicode_GET_LENGTH(string), keepends);
10226 break;
10227 default:
10228 assert(0);
10229 list = 0;
10230 }
10231 return list;
10232}
10233
10234static PyObject *
10235split(PyObject *self,
10236 PyObject *substring,
10237 Py_ssize_t maxcount)
10238{
10239 int kind1, kind2;
10240 void *buf1, *buf2;
10241 Py_ssize_t len1, len2;
10242 PyObject* out;
10243
10244 if (maxcount < 0)
10245 maxcount = PY_SSIZE_T_MAX;
10246
10247 if (PyUnicode_READY(self) == -1)
10248 return NULL;
10249
10250 if (substring == NULL)
10251 switch (PyUnicode_KIND(self)) {
10252 case PyUnicode_1BYTE_KIND:
10253 if (PyUnicode_IS_ASCII(self))
10254 return asciilib_split_whitespace(
10255 self, PyUnicode_1BYTE_DATA(self),
10256 PyUnicode_GET_LENGTH(self), maxcount
10257 );
10258 else
10259 return ucs1lib_split_whitespace(
10260 self, PyUnicode_1BYTE_DATA(self),
10261 PyUnicode_GET_LENGTH(self), maxcount
10262 );
10263 case PyUnicode_2BYTE_KIND:
10264 return ucs2lib_split_whitespace(
10265 self, PyUnicode_2BYTE_DATA(self),
10266 PyUnicode_GET_LENGTH(self), maxcount
10267 );
10268 case PyUnicode_4BYTE_KIND:
10269 return ucs4lib_split_whitespace(
10270 self, PyUnicode_4BYTE_DATA(self),
10271 PyUnicode_GET_LENGTH(self), maxcount
10272 );
10273 default:
10274 assert(0);
10275 return NULL;
10276 }
10277
10278 if (PyUnicode_READY(substring) == -1)
10279 return NULL;
10280
10281 kind1 = PyUnicode_KIND(self);
10282 kind2 = PyUnicode_KIND(substring);
10283 len1 = PyUnicode_GET_LENGTH(self);
10284 len2 = PyUnicode_GET_LENGTH(substring);
10285 if (kind1 < kind2 || len1 < len2) {
10286 out = PyList_New(1);
10287 if (out == NULL)
10288 return NULL;
10289 Py_INCREF(self);
10290 PyList_SET_ITEM(out, 0, self);
10291 return out;
10292 }
10293 buf1 = PyUnicode_DATA(self);
10294 buf2 = PyUnicode_DATA(substring);
10295 if (kind2 != kind1) {
10296 buf2 = _PyUnicode_AsKind(substring, kind1);
10297 if (!buf2)
10298 return NULL;
10299 }
10300
10301 switch (kind1) {
10302 case PyUnicode_1BYTE_KIND:
10303 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10304 out = asciilib_split(
10305 self, buf1, len1, buf2, len2, maxcount);
10306 else
10307 out = ucs1lib_split(
10308 self, buf1, len1, buf2, len2, maxcount);
10309 break;
10310 case PyUnicode_2BYTE_KIND:
10311 out = ucs2lib_split(
10312 self, buf1, len1, buf2, len2, maxcount);
10313 break;
10314 case PyUnicode_4BYTE_KIND:
10315 out = ucs4lib_split(
10316 self, buf1, len1, buf2, len2, maxcount);
10317 break;
10318 default:
10319 out = NULL;
10320 }
10321 if (kind2 != kind1)
10322 PyMem_Free(buf2);
10323 return out;
10324}
10325
10326static PyObject *
10327rsplit(PyObject *self,
10328 PyObject *substring,
10329 Py_ssize_t maxcount)
10330{
10331 int kind1, kind2;
10332 void *buf1, *buf2;
10333 Py_ssize_t len1, len2;
10334 PyObject* out;
10335
10336 if (maxcount < 0)
10337 maxcount = PY_SSIZE_T_MAX;
10338
10339 if (PyUnicode_READY(self) == -1)
10340 return NULL;
10341
10342 if (substring == NULL)
10343 switch (PyUnicode_KIND(self)) {
10344 case PyUnicode_1BYTE_KIND:
10345 if (PyUnicode_IS_ASCII(self))
10346 return asciilib_rsplit_whitespace(
10347 self, PyUnicode_1BYTE_DATA(self),
10348 PyUnicode_GET_LENGTH(self), maxcount
10349 );
10350 else
10351 return ucs1lib_rsplit_whitespace(
10352 self, PyUnicode_1BYTE_DATA(self),
10353 PyUnicode_GET_LENGTH(self), maxcount
10354 );
10355 case PyUnicode_2BYTE_KIND:
10356 return ucs2lib_rsplit_whitespace(
10357 self, PyUnicode_2BYTE_DATA(self),
10358 PyUnicode_GET_LENGTH(self), maxcount
10359 );
10360 case PyUnicode_4BYTE_KIND:
10361 return ucs4lib_rsplit_whitespace(
10362 self, PyUnicode_4BYTE_DATA(self),
10363 PyUnicode_GET_LENGTH(self), maxcount
10364 );
10365 default:
10366 assert(0);
10367 return NULL;
10368 }
10369
10370 if (PyUnicode_READY(substring) == -1)
10371 return NULL;
10372
10373 kind1 = PyUnicode_KIND(self);
10374 kind2 = PyUnicode_KIND(substring);
10375 len1 = PyUnicode_GET_LENGTH(self);
10376 len2 = PyUnicode_GET_LENGTH(substring);
10377 if (kind1 < kind2 || len1 < len2) {
10378 out = PyList_New(1);
10379 if (out == NULL)
10380 return NULL;
10381 Py_INCREF(self);
10382 PyList_SET_ITEM(out, 0, self);
10383 return out;
10384 }
10385 buf1 = PyUnicode_DATA(self);
10386 buf2 = PyUnicode_DATA(substring);
10387 if (kind2 != kind1) {
10388 buf2 = _PyUnicode_AsKind(substring, kind1);
10389 if (!buf2)
10390 return NULL;
10391 }
10392
10393 switch (kind1) {
10394 case PyUnicode_1BYTE_KIND:
10395 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10396 out = asciilib_rsplit(
10397 self, buf1, len1, buf2, len2, maxcount);
10398 else
10399 out = ucs1lib_rsplit(
10400 self, buf1, len1, buf2, len2, maxcount);
10401 break;
10402 case PyUnicode_2BYTE_KIND:
10403 out = ucs2lib_rsplit(
10404 self, buf1, len1, buf2, len2, maxcount);
10405 break;
10406 case PyUnicode_4BYTE_KIND:
10407 out = ucs4lib_rsplit(
10408 self, buf1, len1, buf2, len2, maxcount);
10409 break;
10410 default:
10411 out = NULL;
10412 }
10413 if (kind2 != kind1)
10414 PyMem_Free(buf2);
10415 return out;
10416}
10417
10418static Py_ssize_t
10419anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10420 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10421{
10422 switch (kind) {
10423 case PyUnicode_1BYTE_KIND:
10424 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10425 return asciilib_find(buf1, len1, buf2, len2, offset);
10426 else
10427 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10428 case PyUnicode_2BYTE_KIND:
10429 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10430 case PyUnicode_4BYTE_KIND:
10431 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10432 }
10433 assert(0);
10434 return -1;
10435}
10436
10437static Py_ssize_t
10438anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10439 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10440{
10441 switch (kind) {
10442 case PyUnicode_1BYTE_KIND:
10443 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10444 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10445 else
10446 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10447 case PyUnicode_2BYTE_KIND:
10448 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10449 case PyUnicode_4BYTE_KIND:
10450 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10451 }
10452 assert(0);
10453 return 0;
10454}
10455
10456static void
10457replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10458 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10459{
10460 int kind = PyUnicode_KIND(u);
10461 void *data = PyUnicode_DATA(u);
10462 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10463 if (kind == PyUnicode_1BYTE_KIND) {
10464 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10465 (Py_UCS1 *)data + len,
10466 u1, u2, maxcount);
10467 }
10468 else if (kind == PyUnicode_2BYTE_KIND) {
10469 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10470 (Py_UCS2 *)data + len,
10471 u1, u2, maxcount);
10472 }
10473 else {
10474 assert(kind == PyUnicode_4BYTE_KIND);
10475 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10476 (Py_UCS4 *)data + len,
10477 u1, u2, maxcount);
10478 }
10479}
10480
/* Build the string obtained from self by replacing occurrences of str1
   with str2, performing at most maxcount replacements (a negative maxcount
   is turned into PY_SSIZE_T_MAX, i.e. effectively unlimited).

   Returns the resulting object, or NULL on failure (an allocation failed,
   or OverflowError was set because the result would be too long).  When
   there is nothing to replace the function returns whatever
   unicode_result_unchanged(self) yields.

   None of the three arguments is readied here.
   NOTE(review): presumably the caller guarantees that self, str1 and str2
   are ready unicode objects -- confirm against the call sites.

   Three strategies are used:
     - len1 == len2 == 1: copy self and patch single characters in place;
     - len1 == len2  > 1: copy self and overwrite each match in place;
     - len1 != len2     : count the matches first, allocate the exact
                          result size, then copy pieces across. */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    char *sbuf = PyUnicode_DATA(self);
    char *buf1 = PyUnicode_DATA(str1);
    char *buf2 = PyUnicode_DATA(str2);
    /* Each flag is set once the matching buffer pointer has been replaced
       by a widened copy from _PyUnicode_AsKind(); such copies are freed at
       the exit labels below. */
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* Note: the empty-self shortcut is only evaluated when maxcount is
       non-negative; with a negative maxcount an empty self continues into
       the general code below. */
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || slen == 0)
        goto nothing;

    /* replacing an object by itself cannot change anything */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    /* from here on maxchar is the bound used to allocate the result */
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* copy everything, then patch from the first match onwards */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind is the character width all buffers are brought to */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            /* locate the first match before allocating anything else */
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = _PyUnicode_AsKind(str2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* drop a copy of str1 made for the narrower kind; the flag
                   is cleared first so a failure below cannot free it twice */
                if (release1) PyMem_Free(buf1);
                release1 = 0;
                sbuf = _PyUnicode_AsKind(self, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* rkind doubles as the per-character byte width in the
               byte-offset arithmetic below */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* one replacement is already done, hence the pre-decrement */
            while ( --maxcount > 0) {
                /* search the remainder; the result is used as an index
                   into the whole string */
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        /* n: number of replacements; i: read index into self;
           j: index of the current match; ires: write index into res */
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* count the matches (capped by maxcount) to size the result */
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = _PyUnicode_AsKind(str2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = _PyUnicode_AsKind(self, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            if (release1) PyMem_Free(buf1);
            release1 = 0;
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1)); */
        /* check that slen + n * (len2 - len1) fits before computing it */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            /* everything was replaced by nothing: hand out the shared
               empty string */
            _Py_INCREF_UNICODE_EMPTY();
            if (!unicode_empty)
                goto error;
            u = unicode_empty;
            goto done;
        }
        /* the byte size rkind * new_size must fit as well */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* str1 is empty: emit str2, then one character of self, and
               repeat until n copies of str2 have been written */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            /* whatever is left of self follows the last insertion */
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* unicode_adjust_maxchar() may replace u; NULL signals failure */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* success: free the widened copies and return the result */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    return unicode_result_unchanged(self);

  error:
    /* failure: free whatever widened copies exist and report NULL */
    if (srelease && sbuf)
        PyMem_FREE(sbuf);
    if (release1 && buf1)
        PyMem_FREE(buf1);
    if (release2 && buf2)
        PyMem_FREE(buf2);
    return NULL;
}
10742
10743/* --- Unicode Object Methods --------------------------------------------- */
10744
10745/*[clinic input]
10746str.title as unicode_title
10747
10748Return a version of the string where each word is titlecased.
10749
10750More specifically, words start with uppercased characters and all remaining
10751cased characters have lower case.
10752[clinic start generated code]*/
10753
10754static PyObject *
10755unicode_title_impl(PyObject *self)
10756/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10757{
10758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
10760 return case_operation(self, do_title);
10761}
10762
10763/*[clinic input]
10764str.capitalize as unicode_capitalize
10765
10766Return a capitalized version of the string.
10767
10768More specifically, make the first character have upper case and the rest lower
10769case.
10770[clinic start generated code]*/
10771
10772static PyObject *
10773unicode_capitalize_impl(PyObject *self)
10774/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10775{
10776 if (PyUnicode_READY(self) == -1)
10777 return NULL;
10778 if (PyUnicode_GET_LENGTH(self) == 0)
10779 return unicode_result_unchanged(self);
10780 return case_operation(self, do_capitalize);
10781}
10782
10783/*[clinic input]
10784str.casefold as unicode_casefold
10785
10786Return a version of the string suitable for caseless comparisons.
10787[clinic start generated code]*/
10788
10789static PyObject *
10790unicode_casefold_impl(PyObject *self)
10791/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10792{
10793 if (PyUnicode_READY(self) == -1)
10794 return NULL;
10795 if (PyUnicode_IS_ASCII(self))
10796 return ascii_upper_or_lower(self, 1);
10797 return case_operation(self, do_casefold);
10798}
10799
10800
10801/* Argument converter. Accepts a single Unicode character. */
10802
10803static int
10804convert_uc(PyObject *obj, void *addr)
10805{
10806 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10807
10808 if (!PyUnicode_Check(obj)) {
10809 PyErr_Format(PyExc_TypeError,
10810 "The fill character must be a unicode character, "
10811 "not %.100s", Py_TYPE(obj)->tp_name);
10812 return 0;
10813 }
10814 if (PyUnicode_READY(obj) < 0)
10815 return 0;
10816 if (PyUnicode_GET_LENGTH(obj) != 1) {
10817 PyErr_SetString(PyExc_TypeError,
10818 "The fill character must be exactly one character long");
10819 return 0;
10820 }
10821 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10822 return 1;
10823}
10824
10825/*[clinic input]
10826str.center as unicode_center
10827
10828 width: Py_ssize_t
10829 fillchar: Py_UCS4 = ' '
10830 /
10831
10832Return a centered string of length width.
10833
10834Padding is done using the specified fill character (default is a space).
10835[clinic start generated code]*/
10836
10837static PyObject *
10838unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10839/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10840{
10841 Py_ssize_t marg, left;
10842
10843 if (PyUnicode_READY(self) == -1)
10844 return NULL;
10845
10846 if (PyUnicode_GET_LENGTH(self) >= width)
10847 return unicode_result_unchanged(self);
10848
10849 marg = width - PyUnicode_GET_LENGTH(self);
10850 left = marg / 2 + (marg & width & 1);
10851
10852 return pad(self, left, marg - left, fillchar);
10853}
10854
10855/* This function assumes that str1 and str2 are readied by the caller. */
10856
10857static int
10858unicode_compare(PyObject *str1, PyObject *str2)
10859{
10860#define COMPARE(TYPE1, TYPE2) \
10861 do { \
10862 TYPE1* p1 = (TYPE1 *)data1; \
10863 TYPE2* p2 = (TYPE2 *)data2; \
10864 TYPE1* end = p1 + len; \
10865 Py_UCS4 c1, c2; \
10866 for (; p1 != end; p1++, p2++) { \
10867 c1 = *p1; \
10868 c2 = *p2; \
10869 if (c1 != c2) \
10870 return (c1 < c2) ? -1 : 1; \
10871 } \
10872 } \
10873 while (0)
10874
10875 int kind1, kind2;
10876 void *data1, *data2;
10877 Py_ssize_t len1, len2, len;
10878
10879 kind1 = PyUnicode_KIND(str1);
10880 kind2 = PyUnicode_KIND(str2);
10881 data1 = PyUnicode_DATA(str1);
10882 data2 = PyUnicode_DATA(str2);
10883 len1 = PyUnicode_GET_LENGTH(str1);
10884 len2 = PyUnicode_GET_LENGTH(str2);
10885 len = Py_MIN(len1, len2);
10886
10887 switch(kind1) {
10888 case PyUnicode_1BYTE_KIND:
10889 {
10890 switch(kind2) {
10891 case PyUnicode_1BYTE_KIND:
10892 {
10893 int cmp = memcmp(data1, data2, len);
10894 /* normalize result of memcmp() into the range [-1; 1] */
10895 if (cmp < 0)
10896 return -1;
10897 if (cmp > 0)
10898 return 1;
10899 break;
10900 }
10901 case PyUnicode_2BYTE_KIND:
10902 COMPARE(Py_UCS1, Py_UCS2);
10903 break;
10904 case PyUnicode_4BYTE_KIND:
10905 COMPARE(Py_UCS1, Py_UCS4);
10906 break;
10907 default:
10908 assert(0);
10909 }
10910 break;
10911 }
10912 case PyUnicode_2BYTE_KIND:
10913 {
10914 switch(kind2) {
10915 case PyUnicode_1BYTE_KIND:
10916 COMPARE(Py_UCS2, Py_UCS1);
10917 break;
10918 case PyUnicode_2BYTE_KIND:
10919 {
10920 COMPARE(Py_UCS2, Py_UCS2);
10921 break;
10922 }
10923 case PyUnicode_4BYTE_KIND:
10924 COMPARE(Py_UCS2, Py_UCS4);
10925 break;
10926 default:
10927 assert(0);
10928 }
10929 break;
10930 }
10931 case PyUnicode_4BYTE_KIND:
10932 {
10933 switch(kind2) {
10934 case PyUnicode_1BYTE_KIND:
10935 COMPARE(Py_UCS4, Py_UCS1);
10936 break;
10937 case PyUnicode_2BYTE_KIND:
10938 COMPARE(Py_UCS4, Py_UCS2);
10939 break;
10940 case PyUnicode_4BYTE_KIND:
10941 {
10942#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10943 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10944 /* normalize result of wmemcmp() into the range [-1; 1] */
10945 if (cmp < 0)
10946 return -1;
10947 if (cmp > 0)
10948 return 1;
10949#else
10950 COMPARE(Py_UCS4, Py_UCS4);
10951#endif
10952 break;
10953 }
10954 default:
10955 assert(0);
10956 }
10957 break;
10958 }
10959 default:
10960 assert(0);
10961 }
10962
10963 if (len1 == len2)
10964 return 0;
10965 if (len1 < len2)
10966 return -1;
10967 else
10968 return 1;
10969
10970#undef COMPARE
10971}
10972
10973static int
10974unicode_compare_eq(PyObject *str1, PyObject *str2)
10975{
10976 int kind;
10977 void *data1, *data2;
10978 Py_ssize_t len;
10979 int cmp;
10980
10981 len = PyUnicode_GET_LENGTH(str1);
10982 if (PyUnicode_GET_LENGTH(str2) != len)
10983 return 0;
10984 kind = PyUnicode_KIND(str1);
10985 if (PyUnicode_KIND(str2) != kind)
10986 return 0;
10987 data1 = PyUnicode_DATA(str1);
10988 data2 = PyUnicode_DATA(str2);
10989
10990 cmp = memcmp(data1, data2, len * kind);
10991 return (cmp == 0);
10992}
10993
10994
10995int
10996PyUnicode_Compare(PyObject *left, PyObject *right)
10997{
10998 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10999 if (PyUnicode_READY(left) == -1 ||
11000 PyUnicode_READY(right) == -1)
11001 return -1;
11002
11003 /* a string is equal to itself */
11004 if (left == right)
11005 return 0;
11006
11007 return unicode_compare(left, right);
11008 }
11009 PyErr_Format(PyExc_TypeError,
11010 "Can't compare %.100s and %.100s",
11011 left->ob_type->tp_name,
11012 right->ob_type->tp_name);
11013 return -1;
11014}
11015
11016int
11017PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11018{
11019 Py_ssize_t i;
11020 int kind;
11021 Py_UCS4 chr;
11022 const unsigned char *ustr = (const unsigned char *)str;
11023
11024 assert(_PyUnicode_CHECK(uni));
11025 if (!PyUnicode_IS_READY(uni)) {
11026 const wchar_t *ws = _PyUnicode_WSTR(uni);
11027 /* Compare Unicode string and source character set string */
11028 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11029 if (chr != ustr[i])
11030 return (chr < ustr[i]) ? -1 : 1;
11031 }
11032 /* This check keeps Python strings that end in '\0' from comparing equal
11033 to C strings identical up to that point. */
11034 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11035 return 1; /* uni is longer */
11036 if (ustr[i])
11037 return -1; /* str is longer */
11038 return 0;
11039 }
11040 kind = PyUnicode_KIND(uni);
11041 if (kind == PyUnicode_1BYTE_KIND) {
11042 const void *data = PyUnicode_1BYTE_DATA(uni);
11043 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11044 size_t len, len2 = strlen(str);
11045 int cmp;
11046
11047 len = Py_MIN(len1, len2);
11048 cmp = memcmp(data, str, len);
11049 if (cmp != 0) {
11050 if (cmp < 0)
11051 return -1;
11052 else
11053 return 1;
11054 }
11055 if (len1 > len2)
11056 return 1; /* uni is longer */
11057 if (len1 < len2)
11058 return -1; /* str is longer */
11059 return 0;
11060 }
11061 else {
11062 void *data = PyUnicode_DATA(uni);
11063 /* Compare Unicode string and source character set string */
11064 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11065 if (chr != (unsigned char)str[i])
11066 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11067 /* This check keeps Python strings that end in '\0' from comparing equal
11068 to C strings identical up to that point. */
11069 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11070 return 1; /* uni is longer */
11071 if (str[i])
11072 return -1; /* str is longer */
11073 return 0;
11074 }
11075}
11076
11077static int
11078non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11079{
11080 size_t i, len;
11081 const wchar_t *p;
11082 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11083 if (strlen(str) != len)
11084 return 0;
11085 p = _PyUnicode_WSTR(unicode);
11086 assert(p);
11087 for (i = 0; i < len; i++) {
11088 unsigned char c = (unsigned char)str[i];
11089 if (c >= 128 || p[i] != (wchar_t)c)
11090 return 0;
11091 }
11092 return 1;
11093}
11094
11095int
11096_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11097{
11098 size_t len;
11099 assert(_PyUnicode_CHECK(unicode));
11100 assert(str);
11101#ifndef NDEBUG
11102 for (const char *p = str; *p; p++) {
11103 assert((unsigned char)*p < 128);
11104 }
11105#endif
11106 if (PyUnicode_READY(unicode) == -1) {
11107 /* Memory error or bad data */
11108 PyErr_Clear();
11109 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11110 }
11111 if (!PyUnicode_IS_ASCII(unicode))
11112 return 0;
11113 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11114 return strlen(str) == len &&
11115 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11116}
11117
11118int
11119_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11120{
11121 PyObject *right_uni;
11122 Py_hash_t hash;
11123
11124 assert(_PyUnicode_CHECK(left));
11125 assert(right->string);
11126#ifndef NDEBUG
11127 for (const char *p = right->string; *p; p++) {
11128 assert((unsigned char)*p < 128);
11129 }
11130#endif
11131
11132 if (PyUnicode_READY(left) == -1) {
11133 /* memory error or bad data */
11134 PyErr_Clear();
11135 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11136 }
11137
11138 if (!PyUnicode_IS_ASCII(left))
11139 return 0;
11140
11141 right_uni = _PyUnicode_FromId(right); /* borrowed */
11142 if (right_uni == NULL) {
11143 /* memory error or bad data */
11144 PyErr_Clear();
11145 return _PyUnicode_EqualToASCIIString(left, right->string);
11146 }
11147
11148 if (left == right_uni)
11149 return 1;
11150
11151 if (PyUnicode_CHECK_INTERNED(left))
11152 return 0;
11153
11154 assert(_PyUnicode_HASH(right_uni) != 1);
11155 hash = _PyUnicode_HASH(left);
11156 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11157 return 0;
11158
11159 return unicode_compare_eq(left, right_uni);
11160}
11161
11162#define TEST_COND(cond) \
11163 ((cond) ? Py_True : Py_False)
11164
11165PyObject *
11166PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11167{
11168 int result;
11169 PyObject *v;
11170
11171 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11172 Py_RETURN_NOTIMPLEMENTED;
11173
11174 if (PyUnicode_READY(left) == -1 ||
11175 PyUnicode_READY(right) == -1)
11176 return NULL;
11177
11178 if (left == right) {
11179 switch (op) {
11180 case Py_EQ:
11181 case Py_LE:
11182 case Py_GE:
11183 /* a string is equal to itself */
11184 v = Py_True;
11185 break;
11186 case Py_NE:
11187 case Py_LT:
11188 case Py_GT:
11189 v = Py_False;
11190 break;
11191 default:
11192 PyErr_BadArgument();
11193 return NULL;
11194 }
11195 }
11196 else if (op == Py_EQ || op == Py_NE) {
11197 result = unicode_compare_eq(left, right);
11198 result ^= (op == Py_NE);
11199 v = TEST_COND(result);
11200 }
11201 else {
11202 result = unicode_compare(left, right);
11203
11204 /* Convert the return value to a Boolean */
11205 switch (op) {
11206 case Py_LE:
11207 v = TEST_COND(result <= 0);
11208 break;
11209 case Py_GE:
11210 v = TEST_COND(result >= 0);
11211 break;
11212 case Py_LT:
11213 v = TEST_COND(result == -1);
11214 break;
11215 case Py_GT:
11216 v = TEST_COND(result == 1);
11217 break;
11218 default:
11219 PyErr_BadArgument();
11220 return NULL;
11221 }
11222 }
11223 Py_INCREF(v);
11224 return v;
11225}
11226
11227int
11228_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11229{
11230 return unicode_eq(aa, bb);
11231}
11232
11233int
11234PyUnicode_Contains(PyObject *str, PyObject *substr)
11235{
11236 int kind1, kind2;
11237 void *buf1, *buf2;
11238 Py_ssize_t len1, len2;
11239 int result;
11240
11241 if (!PyUnicode_Check(substr)) {
11242 PyErr_Format(PyExc_TypeError,
11243 "'in <string>' requires string as left operand, not %.100s",
11244 Py_TYPE(substr)->tp_name);
11245 return -1;
11246 }
11247 if (PyUnicode_READY(substr) == -1)
11248 return -1;
11249 if (ensure_unicode(str) < 0)
11250 return -1;
11251
11252 kind1 = PyUnicode_KIND(str);
11253 kind2 = PyUnicode_KIND(substr);
11254 if (kind1 < kind2)
11255 return 0;
11256 len1 = PyUnicode_GET_LENGTH(str);
11257 len2 = PyUnicode_GET_LENGTH(substr);
11258 if (len1 < len2)
11259 return 0;
11260 buf1 = PyUnicode_DATA(str);
11261 buf2 = PyUnicode_DATA(substr);
11262 if (len2 == 1) {
11263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11264 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11265 return result;
11266 }
11267 if (kind2 != kind1) {
11268 buf2 = _PyUnicode_AsKind(substr, kind1);
11269 if (!buf2)
11270 return -1;
11271 }
11272
11273 switch (kind1) {
11274 case PyUnicode_1BYTE_KIND:
11275 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11276 break;
11277 case PyUnicode_2BYTE_KIND:
11278 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11279 break;
11280 case PyUnicode_4BYTE_KIND:
11281 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11282 break;
11283 default:
11284 result = -1;
11285 assert(0);
11286 }
11287
11288 if (kind2 != kind1)
11289 PyMem_Free(buf2);
11290
11291 return result;
11292}
11293
11294/* Concat to string or Unicode object giving a new Unicode object. */
11295
11296PyObject *
11297PyUnicode_Concat(PyObject *left, PyObject *right)
11298{
11299 PyObject *result;
11300 Py_UCS4 maxchar, maxchar2;
11301 Py_ssize_t left_len, right_len, new_len;
11302
11303 if (ensure_unicode(left) < 0)
11304 return NULL;
11305
11306 if (!PyUnicode_Check(right)) {
11307 PyErr_Format(PyExc_TypeError,
11308 "can only concatenate str (not \"%.200s\") to str",
11309 right->ob_type->tp_name);
11310 return NULL;
11311 }
11312 if (PyUnicode_READY(right) < 0)
11313 return NULL;
11314
11315 /* Shortcuts */
11316 if (left == unicode_empty)
11317 return PyUnicode_FromObject(right);
11318 if (right == unicode_empty)
11319 return PyUnicode_FromObject(left);
11320
11321 left_len = PyUnicode_GET_LENGTH(left);
11322 right_len = PyUnicode_GET_LENGTH(right);
11323 if (left_len > PY_SSIZE_T_MAX - right_len) {
11324 PyErr_SetString(PyExc_OverflowError,
11325 "strings are too large to concat");
11326 return NULL;
11327 }
11328 new_len = left_len + right_len;
11329
11330 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11331 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11332 maxchar = Py_MAX(maxchar, maxchar2);
11333
11334 /* Concat the two Unicode strings */
11335 result = PyUnicode_New(new_len, maxchar);
11336 if (result == NULL)
11337 return NULL;
11338 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11339 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11340 assert(_PyUnicode_CheckConsistency(result, 1));
11341 return result;
11342}
11343
/* Append `right` to the string referenced by *p_left and store the result
   back into *p_left.

   p_left -- address of the reference to the left operand; the reference
             is replaced (or the object it names is resized) on success.
   right  -- the string to append; it is only read, its reference is not
             consumed.

   Nothing is returned.  On failure *p_left is cleared with Py_CLEAR (so it
   ends up NULL); the only exception is a NULL p_left itself, where the
   function merely makes sure an error is set and returns.

   Two strategies are used: when unicode_modifiable(left) holds and the
   other conditions checked below are met, the left string is resized and
   extended in place; otherwise a new string is created and the reference
   to the old left string is released. */
void
PyUnicode_Append(PyObject **p_left, PyObject *right)
{
    PyObject *left, *res;
    Py_UCS4 maxchar, maxchar2;
    Py_ssize_t left_len, right_len, new_len;

    if (p_left == NULL) {
        /* nothing to clear here, so no jump to the error label */
        if (!PyErr_Occurred())
            PyErr_BadInternalCall();
        return;
    }
    left = *p_left;
    if (right == NULL || left == NULL
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        /* keep an error that is already pending rather than replacing it */
        if (!PyErr_Occurred())
            PyErr_BadInternalCall();
        goto error;
    }

    if (PyUnicode_READY(left) == -1)
        goto error;
    if (PyUnicode_READY(right) == -1)
        goto error;

    /* Shortcuts */
    if (left == unicode_empty) {
        /* empty + right: make *p_left refer to right itself */
        Py_DECREF(left);
        Py_INCREF(right);
        *p_left = right;
        return;
    }
    if (right == unicode_empty)
        /* left + empty: *p_left is already the answer */
        return;

    left_len = PyUnicode_GET_LENGTH(left);
    right_len = PyUnicode_GET_LENGTH(right);
    /* refuse before the sum of the lengths could overflow */
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        goto error;
    }
    new_len = left_len + right_len;

    if (unicode_modifiable(left)
        && PyUnicode_CheckExact(right)
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
           to change the structure size, but characters are stored just after
           the structure, and so it requires to move all characters which is
           not so different than duplicating the string. */
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
    {
        /* append inplace */
        /* unicode_resize() is given p_left and may change *p_left, which is
           why *p_left (not the local `left`) is used afterwards */
        if (unicode_resize(p_left, new_len) != 0)
            goto error;

        /* copy 'right' into the newly allocated area of 'left' */
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
    }
    else {
        /* the result must be able to hold the widest character of either
           operand */
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
        maxchar = Py_MAX(maxchar, maxchar2);

        /* Concat the two Unicode strings */
        res = PyUnicode_New(new_len, maxchar);
        if (res == NULL)
            goto error;
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
        /* the old left string is released only after it has been copied */
        Py_DECREF(left);
        *p_left = res;
    }
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
    return;

error:
    /* drop the reference held in *p_left and set it to NULL */
    Py_CLEAR(*p_left);
}
11424
11425void
11426PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11427{
11428 PyUnicode_Append(pleft, right);
11429 Py_XDECREF(right);
11430}
11431
11432/*
11433Wraps stringlib_parse_args_finds() and additionally ensures that the
11434first argument is a unicode object.
11435*/
11436
11437static inline int
11438parse_args_finds_unicode(const char * function_name, PyObject *args,
11439 PyObject **substring,
11440 Py_ssize_t *start, Py_ssize_t *end)
11441{
11442 if(stringlib_parse_args_finds(function_name, args, substring,
11443 start, end)) {
11444 if (ensure_unicode(*substring) < 0)
11445 return 0;
11446 return 1;
11447 }
11448 return 0;
11449}
11450
/* Docstring text describing S.count(); the string content below is
   user-visible at runtime and is kept exactly as written. */
PyDoc_STRVAR(count__doc__,
             "S.count(sub[, start[, end]]) -> int\n\
\n\
Return the number of non-overlapping occurrences of substring sub in\n\
string S[start:end]. Optional arguments start and end are\n\
interpreted as in slice notation.");
11457
11458static PyObject *
11459unicode_count(PyObject *self, PyObject *args)
11460{
11461 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11462 Py_ssize_t start = 0;
11463 Py_ssize_t end = PY_SSIZE_T_MAX;
11464 PyObject *result;
11465 int kind1, kind2;
11466 void *buf1, *buf2;
11467 Py_ssize_t len1, len2, iresult;
11468
11469 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11470 return NULL;
11471
11472 kind1 = PyUnicode_KIND(self);
11473 kind2 = PyUnicode_KIND(substring);
11474 if (kind1 < kind2)
11475 return PyLong_FromLong(0);
11476
11477 len1 = PyUnicode_GET_LENGTH(self);
11478 len2 = PyUnicode_GET_LENGTH(substring);
11479 ADJUST_INDICES(start, end, len1);
11480 if (end - start < len2)
11481 return PyLong_FromLong(0);
11482
11483 buf1 = PyUnicode_DATA(self);
11484 buf2 = PyUnicode_DATA(substring);
11485 if (kind2 != kind1) {
11486 buf2 = _PyUnicode_AsKind(substring, kind1);
11487 if (!buf2)
11488 return NULL;
11489 }
11490 switch (kind1) {
11491 case PyUnicode_1BYTE_KIND:
11492 iresult = ucs1lib_count(
11493 ((Py_UCS1*)buf1) + start, end - start,
11494 buf2, len2, PY_SSIZE_T_MAX
11495 );
11496 break;
11497 case PyUnicode_2BYTE_KIND:
11498 iresult = ucs2lib_count(
11499 ((Py_UCS2*)buf1) + start, end - start,
11500 buf2, len2, PY_SSIZE_T_MAX
11501 );
11502 break;
11503 case PyUnicode_4BYTE_KIND:
11504 iresult = ucs4lib_count(
11505 ((Py_UCS4*)buf1) + start, end - start,
11506 buf2, len2, PY_SSIZE_T_MAX
11507 );
11508 break;
11509 default:
11510 assert(0); iresult = 0;
11511 }
11512
11513 result = PyLong_FromSsize_t(iresult);
11514
11515 if (kind2 != kind1)
11516 PyMem_Free(buf2);
11517
11518 return result;
11519}
11520
11521/*[clinic input]
11522str.encode as unicode_encode
11523
11524 encoding: str(c_default="NULL") = 'utf-8'
11525 The encoding in which to encode the string.
11526 errors: str(c_default="NULL") = 'strict'
11527 The error handling scheme to use for encoding errors.
11528 The default is 'strict' meaning that encoding errors raise a
11529 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11530 'xmlcharrefreplace' as well as any other name registered with
11531 codecs.register_error that can handle UnicodeEncodeErrors.
11532
11533Encode the string using the codec registered for encoding.
11534[clinic start generated code]*/
11535
11536static PyObject *
11537unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11538/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11539{
11540 return PyUnicode_AsEncodedString(self, encoding, errors);
11541}
11542
11543/*[clinic input]
11544str.expandtabs as unicode_expandtabs
11545
11546 tabsize: int = 8
11547
11548Return a copy where all tab characters are expanded using spaces.
11549
11550If tabsize is not given, a tab size of 8 characters is assumed.
11551[clinic start generated code]*/
11552
11553static PyObject *
11554unicode_expandtabs_impl(PyObject *self, int tabsize)
11555/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11556{
11557 Py_ssize_t i, j, line_pos, src_len, incr;
11558 Py_UCS4 ch;
11559 PyObject *u;
11560 void *src_data, *dest_data;
11561 int kind;
11562 int found;
11563
11564 if (PyUnicode_READY(self) == -1)
11565 return NULL;
11566
11567 /* First pass: determine size of output string */
11568 src_len = PyUnicode_GET_LENGTH(self);
11569 i = j = line_pos = 0;
11570 kind = PyUnicode_KIND(self);
11571 src_data = PyUnicode_DATA(self);
11572 found = 0;
11573 for (; i < src_len; i++) {
11574 ch = PyUnicode_READ(kind, src_data, i);
11575 if (ch == '\t') {
11576 found = 1;
11577 if (tabsize > 0) {
11578 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11579 if (j > PY_SSIZE_T_MAX - incr)
11580 goto overflow;
11581 line_pos += incr;
11582 j += incr;
11583 }
11584 }
11585 else {
11586 if (j > PY_SSIZE_T_MAX - 1)
11587 goto overflow;
11588 line_pos++;
11589 j++;
11590 if (ch == '\n' || ch == '\r')
11591 line_pos = 0;
11592 }
11593 }
11594 if (!found)
11595 return unicode_result_unchanged(self);
11596
11597 /* Second pass: create output string and fill it */
11598 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11599 if (!u)
11600 return NULL;
11601 dest_data = PyUnicode_DATA(u);
11602
11603 i = j = line_pos = 0;
11604
11605 for (; i < src_len; i++) {
11606 ch = PyUnicode_READ(kind, src_data, i);
11607 if (ch == '\t') {
11608 if (tabsize > 0) {
11609 incr = tabsize - (line_pos % tabsize);
11610 line_pos += incr;
11611 FILL(kind, dest_data, ' ', j, incr);
11612 j += incr;
11613 }
11614 }
11615 else {
11616 line_pos++;
11617 PyUnicode_WRITE(kind, dest_data, j, ch);
11618 j++;
11619 if (ch == '\n' || ch == '\r')
11620 line_pos = 0;
11621 }
11622 }
11623 assert (j == PyUnicode_GET_LENGTH(u));
11624 return unicode_result(u);
11625
11626 overflow:
11627 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11628 return NULL;
11629}
11630
/* Docstring text describing S.find(); the string content below is
   user-visible at runtime and is kept exactly as written. */
PyDoc_STRVAR(find__doc__,
             "S.find(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
11639
11640static PyObject *
11641unicode_find(PyObject *self, PyObject *args)
11642{
11643 /* initialize variables to prevent gcc warning */
11644 PyObject *substring = NULL;
11645 Py_ssize_t start = 0;
11646 Py_ssize_t end = 0;
11647 Py_ssize_t result;
11648
11649 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11650 return NULL;
11651
11652 if (PyUnicode_READY(self) == -1)
11653 return NULL;
11654
11655 result = any_find_slice(self, substring, start, end, 1);
11656
11657 if (result == -2)
11658 return NULL;
11659
11660 return PyLong_FromSsize_t(result);
11661}
11662
11663static PyObject *
11664unicode_getitem(PyObject *self, Py_ssize_t index)
11665{
11666 void *data;
11667 enum PyUnicode_Kind kind;
11668 Py_UCS4 ch;
11669
11670 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11671 PyErr_BadArgument();
11672 return NULL;
11673 }
11674 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11675 PyErr_SetString(PyExc_IndexError, "string index out of range");
11676 return NULL;
11677 }
11678 kind = PyUnicode_KIND(self);
11679 data = PyUnicode_DATA(self);
11680 ch = PyUnicode_READ(kind, data, index);
11681 return unicode_char(ch);
11682}
11683
11684/* Believe it or not, this produces the same value for ASCII strings
11685 as bytes_hash(). */
11686static Py_hash_t
11687unicode_hash(PyObject *self)
11688{
11689 Py_ssize_t len;
11690 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11691
11692#ifdef Py_DEBUG
11693 assert(_Py_HashSecret_Initialized);
11694#endif
11695 if (_PyUnicode_HASH(self) != -1)
11696 return _PyUnicode_HASH(self);
11697 if (PyUnicode_READY(self) == -1)
11698 return -1;
11699 len = PyUnicode_GET_LENGTH(self);
11700 /*
11701 We make the hash of the empty string be 0, rather than using
11702 (prefix ^ suffix), since this slightly obfuscates the hash secret
11703 */
11704 if (len == 0) {
11705 _PyUnicode_HASH(self) = 0;
11706 return 0;
11707 }
11708 x = _Py_HashBytes(PyUnicode_DATA(self),
11709 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11710 _PyUnicode_HASH(self) = x;
11711 return x;
11712}
11713
/* Docstring text for the index() method; declared through PyDoc_STRVAR
   under the name index__doc__. */
PyDoc_STRVAR(index__doc__,
             "S.index(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found, \n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Raises ValueError when the substring is not found.");
11722
11723static PyObject *
11724unicode_index(PyObject *self, PyObject *args)
11725{
11726 /* initialize variables to prevent gcc warning */
11727 Py_ssize_t result;
11728 PyObject *substring = NULL;
11729 Py_ssize_t start = 0;
11730 Py_ssize_t end = 0;
11731
11732 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11733 return NULL;
11734
11735 if (PyUnicode_READY(self) == -1)
11736 return NULL;
11737
11738 result = any_find_slice(self, substring, start, end, 1);
11739
11740 if (result == -2)
11741 return NULL;
11742
11743 if (result < 0) {
11744 PyErr_SetString(PyExc_ValueError, "substring not found");
11745 return NULL;
11746 }
11747
11748 return PyLong_FromSsize_t(result);
11749}
11750
11751/*[clinic input]
11752str.islower as unicode_islower
11753
11754Return True if the string is a lowercase string, False otherwise.
11755
11756A string is lowercase if all cased characters in the string are lowercase and
11757there is at least one cased character in the string.
11758[clinic start generated code]*/
11759
11760static PyObject *
11761unicode_islower_impl(PyObject *self)
11762/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11763{
11764 Py_ssize_t i, length;
11765 int kind;
11766 void *data;
11767 int cased;
11768
11769 if (PyUnicode_READY(self) == -1)
11770 return NULL;
11771 length = PyUnicode_GET_LENGTH(self);
11772 kind = PyUnicode_KIND(self);
11773 data = PyUnicode_DATA(self);
11774
11775 /* Shortcut for single character strings */
11776 if (length == 1)
11777 return PyBool_FromLong(
11778 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11779
11780 /* Special case for empty strings */
11781 if (length == 0)
11782 Py_RETURN_FALSE;
11783
11784 cased = 0;
11785 for (i = 0; i < length; i++) {
11786 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11787
11788 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11789 Py_RETURN_FALSE;
11790 else if (!cased && Py_UNICODE_ISLOWER(ch))
11791 cased = 1;
11792 }
11793 return PyBool_FromLong(cased);
11794}
11795
11796/*[clinic input]
11797str.isupper as unicode_isupper
11798
11799Return True if the string is an uppercase string, False otherwise.
11800
11801A string is uppercase if all cased characters in the string are uppercase and
11802there is at least one cased character in the string.
11803[clinic start generated code]*/
11804
11805static PyObject *
11806unicode_isupper_impl(PyObject *self)
11807/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11808{
11809 Py_ssize_t i, length;
11810 int kind;
11811 void *data;
11812 int cased;
11813
11814 if (PyUnicode_READY(self) == -1)
11815 return NULL;
11816 length = PyUnicode_GET_LENGTH(self);
11817 kind = PyUnicode_KIND(self);
11818 data = PyUnicode_DATA(self);
11819
11820 /* Shortcut for single character strings */
11821 if (length == 1)
11822 return PyBool_FromLong(
11823 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11824
11825 /* Special case for empty strings */
11826 if (length == 0)
11827 Py_RETURN_FALSE;
11828
11829 cased = 0;
11830 for (i = 0; i < length; i++) {
11831 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11832
11833 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11834 Py_RETURN_FALSE;
11835 else if (!cased && Py_UNICODE_ISUPPER(ch))
11836 cased = 1;
11837 }
11838 return PyBool_FromLong(cased);
11839}
11840
11841/*[clinic input]
11842str.istitle as unicode_istitle
11843
11844Return True if the string is a title-cased string, False otherwise.
11845
11846In a title-cased string, upper- and title-case characters may only
11847follow uncased characters and lowercase characters only cased ones.
11848[clinic start generated code]*/
11849
11850static PyObject *
11851unicode_istitle_impl(PyObject *self)
11852/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11853{
11854 Py_ssize_t i, length;
11855 int kind;
11856 void *data;
11857 int cased, previous_is_cased;
11858
11859 if (PyUnicode_READY(self) == -1)
11860 return NULL;
11861 length = PyUnicode_GET_LENGTH(self);
11862 kind = PyUnicode_KIND(self);
11863 data = PyUnicode_DATA(self);
11864
11865 /* Shortcut for single character strings */
11866 if (length == 1) {
11867 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11868 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11869 (Py_UNICODE_ISUPPER(ch) != 0));
11870 }
11871
11872 /* Special case for empty strings */
11873 if (length == 0)
11874 Py_RETURN_FALSE;
11875
11876 cased = 0;
11877 previous_is_cased = 0;
11878 for (i = 0; i < length; i++) {
11879 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11880
11881 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11882 if (previous_is_cased)
11883 Py_RETURN_FALSE;
11884 previous_is_cased = 1;
11885 cased = 1;
11886 }
11887 else if (Py_UNICODE_ISLOWER(ch)) {
11888 if (!previous_is_cased)
11889 Py_RETURN_FALSE;
11890 previous_is_cased = 1;
11891 cased = 1;
11892 }
11893 else
11894 previous_is_cased = 0;
11895 }
11896 return PyBool_FromLong(cased);
11897}
11898
11899/*[clinic input]
11900str.isspace as unicode_isspace
11901
11902Return True if the string is a whitespace string, False otherwise.
11903
11904A string is whitespace if all characters in the string are whitespace and there
11905is at least one character in the string.
11906[clinic start generated code]*/
11907
11908static PyObject *
11909unicode_isspace_impl(PyObject *self)
11910/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11911{
11912 Py_ssize_t i, length;
11913 int kind;
11914 void *data;
11915
11916 if (PyUnicode_READY(self) == -1)
11917 return NULL;
11918 length = PyUnicode_GET_LENGTH(self);
11919 kind = PyUnicode_KIND(self);
11920 data = PyUnicode_DATA(self);
11921
11922 /* Shortcut for single character strings */
11923 if (length == 1)
11924 return PyBool_FromLong(
11925 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11926
11927 /* Special case for empty strings */
11928 if (length == 0)
11929 Py_RETURN_FALSE;
11930
11931 for (i = 0; i < length; i++) {
11932 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11933 if (!Py_UNICODE_ISSPACE(ch))
11934 Py_RETURN_FALSE;
11935 }
11936 Py_RETURN_TRUE;
11937}
11938
11939/*[clinic input]
11940str.isalpha as unicode_isalpha
11941
11942Return True if the string is an alphabetic string, False otherwise.
11943
11944A string is alphabetic if all characters in the string are alphabetic and there
11945is at least one character in the string.
11946[clinic start generated code]*/
11947
11948static PyObject *
11949unicode_isalpha_impl(PyObject *self)
11950/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11951{
11952 Py_ssize_t i, length;
11953 int kind;
11954 void *data;
11955
11956 if (PyUnicode_READY(self) == -1)
11957 return NULL;
11958 length = PyUnicode_GET_LENGTH(self);
11959 kind = PyUnicode_KIND(self);
11960 data = PyUnicode_DATA(self);
11961
11962 /* Shortcut for single character strings */
11963 if (length == 1)
11964 return PyBool_FromLong(
11965 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11966
11967 /* Special case for empty strings */
11968 if (length == 0)
11969 Py_RETURN_FALSE;
11970
11971 for (i = 0; i < length; i++) {
11972 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11973 Py_RETURN_FALSE;
11974 }
11975 Py_RETURN_TRUE;
11976}
11977
11978/*[clinic input]
11979str.isalnum as unicode_isalnum
11980
11981Return True if the string is an alpha-numeric string, False otherwise.
11982
11983A string is alpha-numeric if all characters in the string are alpha-numeric and
11984there is at least one character in the string.
11985[clinic start generated code]*/
11986
11987static PyObject *
11988unicode_isalnum_impl(PyObject *self)
11989/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11990{
11991 int kind;
11992 void *data;
11993 Py_ssize_t len, i;
11994
11995 if (PyUnicode_READY(self) == -1)
11996 return NULL;
11997
11998 kind = PyUnicode_KIND(self);
11999 data = PyUnicode_DATA(self);
12000 len = PyUnicode_GET_LENGTH(self);
12001
12002 /* Shortcut for single character strings */
12003 if (len == 1) {
12004 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12005 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12006 }
12007
12008 /* Special case for empty strings */
12009 if (len == 0)
12010 Py_RETURN_FALSE;
12011
12012 for (i = 0; i < len; i++) {
12013 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12014 if (!Py_UNICODE_ISALNUM(ch))
12015 Py_RETURN_FALSE;
12016 }
12017 Py_RETURN_TRUE;
12018}
12019
12020/*[clinic input]
12021str.isdecimal as unicode_isdecimal
12022
12023Return True if the string is a decimal string, False otherwise.
12024
12025A string is a decimal string if all characters in the string are decimal and
12026there is at least one character in the string.
12027[clinic start generated code]*/
12028
12029static PyObject *
12030unicode_isdecimal_impl(PyObject *self)
12031/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12032{
12033 Py_ssize_t i, length;
12034 int kind;
12035 void *data;
12036
12037 if (PyUnicode_READY(self) == -1)
12038 return NULL;
12039 length = PyUnicode_GET_LENGTH(self);
12040 kind = PyUnicode_KIND(self);
12041 data = PyUnicode_DATA(self);
12042
12043 /* Shortcut for single character strings */
12044 if (length == 1)
12045 return PyBool_FromLong(
12046 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12047
12048 /* Special case for empty strings */
12049 if (length == 0)
12050 Py_RETURN_FALSE;
12051
12052 for (i = 0; i < length; i++) {
12053 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12054 Py_RETURN_FALSE;
12055 }
12056 Py_RETURN_TRUE;
12057}
12058
12059/*[clinic input]
12060str.isdigit as unicode_isdigit
12061
12062Return True if the string is a digit string, False otherwise.
12063
12064A string is a digit string if all characters in the string are digits and there
12065is at least one character in the string.
12066[clinic start generated code]*/
12067
12068static PyObject *
12069unicode_isdigit_impl(PyObject *self)
12070/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12071{
12072 Py_ssize_t i, length;
12073 int kind;
12074 void *data;
12075
12076 if (PyUnicode_READY(self) == -1)
12077 return NULL;
12078 length = PyUnicode_GET_LENGTH(self);
12079 kind = PyUnicode_KIND(self);
12080 data = PyUnicode_DATA(self);
12081
12082 /* Shortcut for single character strings */
12083 if (length == 1) {
12084 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12085 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12086 }
12087
12088 /* Special case for empty strings */
12089 if (length == 0)
12090 Py_RETURN_FALSE;
12091
12092 for (i = 0; i < length; i++) {
12093 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12094 Py_RETURN_FALSE;
12095 }
12096 Py_RETURN_TRUE;
12097}
12098
12099/*[clinic input]
12100str.isnumeric as unicode_isnumeric
12101
12102Return True if the string is a numeric string, False otherwise.
12103
12104A string is numeric if all characters in the string are numeric and there is at
12105least one character in the string.
12106[clinic start generated code]*/
12107
12108static PyObject *
12109unicode_isnumeric_impl(PyObject *self)
12110/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12111{
12112 Py_ssize_t i, length;
12113 int kind;
12114 void *data;
12115
12116 if (PyUnicode_READY(self) == -1)
12117 return NULL;
12118 length = PyUnicode_GET_LENGTH(self);
12119 kind = PyUnicode_KIND(self);
12120 data = PyUnicode_DATA(self);
12121
12122 /* Shortcut for single character strings */
12123 if (length == 1)
12124 return PyBool_FromLong(
12125 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12126
12127 /* Special case for empty strings */
12128 if (length == 0)
12129 Py_RETURN_FALSE;
12130
12131 for (i = 0; i < length; i++) {
12132 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12133 Py_RETURN_FALSE;
12134 }
12135 Py_RETURN_TRUE;
12136}
12137
12138int
12139PyUnicode_IsIdentifier(PyObject *self)
12140{
12141 int kind;
12142 void *data;
12143 Py_ssize_t i;
12144 Py_UCS4 first;
12145
12146 if (PyUnicode_READY(self) == -1) {
12147 Py_FatalError("identifier not ready");
12148 return 0;
12149 }
12150
12151 /* Special case for empty strings */
12152 if (PyUnicode_GET_LENGTH(self) == 0)
12153 return 0;
12154 kind = PyUnicode_KIND(self);
12155 data = PyUnicode_DATA(self);
12156
12157 /* PEP 3131 says that the first character must be in
12158 XID_Start and subsequent characters in XID_Continue,
12159 and for the ASCII range, the 2.x rules apply (i.e
12160 start with letters and underscore, continue with
12161 letters, digits, underscore). However, given the current
12162 definition of XID_Start and XID_Continue, it is sufficient
12163 to check just for these, except that _ must be allowed
12164 as starting an identifier. */
12165 first = PyUnicode_READ(kind, data, 0);
12166 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12167 return 0;
12168
12169 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12170 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12171 return 0;
12172 return 1;
12173}
12174
12175/*[clinic input]
12176str.isidentifier as unicode_isidentifier
12177
12178Return True if the string is a valid Python identifier, False otherwise.
12179
12180Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12181"class".
12182[clinic start generated code]*/
12183
12184static PyObject *
12185unicode_isidentifier_impl(PyObject *self)
12186/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
12187{
12188 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12189}
12190
12191/*[clinic input]
12192str.isprintable as unicode_isprintable
12193
12194Return True if the string is printable, False otherwise.
12195
12196A string is printable if all of its characters are considered printable in
12197repr() or if it is empty.
12198[clinic start generated code]*/
12199
12200static PyObject *
12201unicode_isprintable_impl(PyObject *self)
12202/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12203{
12204 Py_ssize_t i, length;
12205 int kind;
12206 void *data;
12207
12208 if (PyUnicode_READY(self) == -1)
12209 return NULL;
12210 length = PyUnicode_GET_LENGTH(self);
12211 kind = PyUnicode_KIND(self);
12212 data = PyUnicode_DATA(self);
12213
12214 /* Shortcut for single character strings */
12215 if (length == 1)
12216 return PyBool_FromLong(
12217 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12218
12219 for (i = 0; i < length; i++) {
12220 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12221 Py_RETURN_FALSE;
12222 }
12223 }
12224 Py_RETURN_TRUE;
12225}
12226
12227/*[clinic input]
12228str.join as unicode_join
12229
12230 iterable: object
12231 /
12232
12233Concatenate any number of strings.
12234
12235The string whose method is called is inserted in between each given string.
12236The result is returned as a new string.
12237
12238Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12239[clinic start generated code]*/
12240
12241static PyObject *
12242unicode_join(PyObject *self, PyObject *iterable)
12243/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12244{
12245 return PyUnicode_Join(self, iterable);
12246}
12247
12248static Py_ssize_t
12249unicode_length(PyObject *self)
12250{
12251 if (PyUnicode_READY(self) == -1)
12252 return -1;
12253 return PyUnicode_GET_LENGTH(self);
12254}
12255
12256/*[clinic input]
12257str.ljust as unicode_ljust
12258
12259 width: Py_ssize_t
12260 fillchar: Py_UCS4 = ' '
12261 /
12262
12263Return a left-justified string of length width.
12264
12265Padding is done using the specified fill character (default is a space).
12266[clinic start generated code]*/
12267
12268static PyObject *
12269unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12270/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12271{
12272 if (PyUnicode_READY(self) == -1)
12273 return NULL;
12274
12275 if (PyUnicode_GET_LENGTH(self) >= width)
12276 return unicode_result_unchanged(self);
12277
12278 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12279}
12280
12281/*[clinic input]
12282str.lower as unicode_lower
12283
12284Return a copy of the string converted to lowercase.
12285[clinic start generated code]*/
12286
12287static PyObject *
12288unicode_lower_impl(PyObject *self)
12289/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12290{
12291 if (PyUnicode_READY(self) == -1)
12292 return NULL;
12293 if (PyUnicode_IS_ASCII(self))
12294 return ascii_upper_or_lower(self, 1);
12295 return case_operation(self, do_lower);
12296}
12297
/* Selector values passed as the striptype argument of the strip helpers
   below; they also serve as indices into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name matching a striptype value; do_argstrip() uses it when
   formatting its TypeError message. */
#define STRIPNAME(i) (stripfuncnames[i])
12306
12307/* externally visible for str.strip(unicode) */
12308PyObject *
12309_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12310{
12311 void *data;
12312 int kind;
12313 Py_ssize_t i, j, len;
12314 BLOOM_MASK sepmask;
12315 Py_ssize_t seplen;
12316
12317 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12318 return NULL;
12319
12320 kind = PyUnicode_KIND(self);
12321 data = PyUnicode_DATA(self);
12322 len = PyUnicode_GET_LENGTH(self);
12323 seplen = PyUnicode_GET_LENGTH(sepobj);
12324 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12325 PyUnicode_DATA(sepobj),
12326 seplen);
12327
12328 i = 0;
12329 if (striptype != RIGHTSTRIP) {
12330 while (i < len) {
12331 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12332 if (!BLOOM(sepmask, ch))
12333 break;
12334 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12335 break;
12336 i++;
12337 }
12338 }
12339
12340 j = len;
12341 if (striptype != LEFTSTRIP) {
12342 j--;
12343 while (j >= i) {
12344 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12345 if (!BLOOM(sepmask, ch))
12346 break;
12347 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12348 break;
12349 j--;
12350 }
12351
12352 j++;
12353 }
12354
12355 return PyUnicode_Substring(self, i, j);
12356}
12357
12358PyObject*
12359PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12360{
12361 unsigned char *data;
12362 int kind;
12363 Py_ssize_t length;
12364
12365 if (PyUnicode_READY(self) == -1)
12366 return NULL;
12367
12368 length = PyUnicode_GET_LENGTH(self);
12369 end = Py_MIN(end, length);
12370
12371 if (start == 0 && end == length)
12372 return unicode_result_unchanged(self);
12373
12374 if (start < 0 || end < 0) {
12375 PyErr_SetString(PyExc_IndexError, "string index out of range");
12376 return NULL;
12377 }
12378 if (start >= length || end < start)
12379 _Py_RETURN_UNICODE_EMPTY();
12380
12381 length = end - start;
12382 if (PyUnicode_IS_ASCII(self)) {
12383 data = PyUnicode_1BYTE_DATA(self);
12384 return _PyUnicode_FromASCII((char*)(data + start), length);
12385 }
12386 else {
12387 kind = PyUnicode_KIND(self);
12388 data = PyUnicode_1BYTE_DATA(self);
12389 return PyUnicode_FromKindAndData(kind,
12390 data + kind * start,
12391 length);
12392 }
12393}
12394
12395static PyObject *
12396do_strip(PyObject *self, int striptype)
12397{
12398 Py_ssize_t len, i, j;
12399
12400 if (PyUnicode_READY(self) == -1)
12401 return NULL;
12402
12403 len = PyUnicode_GET_LENGTH(self);
12404
12405 if (PyUnicode_IS_ASCII(self)) {
12406 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12407
12408 i = 0;
12409 if (striptype != RIGHTSTRIP) {
12410 while (i < len) {
12411 Py_UCS1 ch = data[i];
12412 if (!_Py_ascii_whitespace[ch])
12413 break;
12414 i++;
12415 }
12416 }
12417
12418 j = len;
12419 if (striptype != LEFTSTRIP) {
12420 j--;
12421 while (j >= i) {
12422 Py_UCS1 ch = data[j];
12423 if (!_Py_ascii_whitespace[ch])
12424 break;
12425 j--;
12426 }
12427 j++;
12428 }
12429 }
12430 else {
12431 int kind = PyUnicode_KIND(self);
12432 void *data = PyUnicode_DATA(self);
12433
12434 i = 0;
12435 if (striptype != RIGHTSTRIP) {
12436 while (i < len) {
12437 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12438 if (!Py_UNICODE_ISSPACE(ch))
12439 break;
12440 i++;
12441 }
12442 }
12443
12444 j = len;
12445 if (striptype != LEFTSTRIP) {
12446 j--;
12447 while (j >= i) {
12448 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12449 if (!Py_UNICODE_ISSPACE(ch))
12450 break;
12451 j--;
12452 }
12453 j++;
12454 }
12455 }
12456
12457 return PyUnicode_Substring(self, i, j);
12458}
12459
12460
12461static PyObject *
12462do_argstrip(PyObject *self, int striptype, PyObject *sep)
12463{
12464 if (sep != NULL && sep != Py_None) {
12465 if (PyUnicode_Check(sep))
12466 return _PyUnicode_XStrip(self, striptype, sep);
12467 else {
12468 PyErr_Format(PyExc_TypeError,
12469 "%s arg must be None or str",
12470 STRIPNAME(striptype));
12471 return NULL;
12472 }
12473 }
12474
12475 return do_strip(self, striptype);
12476}
12477
12478
12479/*[clinic input]
12480str.strip as unicode_strip
12481
12482 chars: object = None
12483 /
12484
Return a copy of the string with leading and trailing whitespace removed.
12486
12487If chars is given and not None, remove characters in chars instead.
12488[clinic start generated code]*/
12489
12490static PyObject *
12491unicode_strip_impl(PyObject *self, PyObject *chars)
12492/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
12493{
12494 return do_argstrip(self, BOTHSTRIP, chars);
12495}
12496
12497
12498/*[clinic input]
12499str.lstrip as unicode_lstrip
12500
12501 chars: object = NULL
12502 /
12503
12504Return a copy of the string with leading whitespace removed.
12505
12506If chars is given and not None, remove characters in chars instead.
12507[clinic start generated code]*/
12508
12509static PyObject *
12510unicode_lstrip_impl(PyObject *self, PyObject *chars)
12511/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
12512{
12513 return do_argstrip(self, LEFTSTRIP, chars);
12514}
12515
12516
12517/*[clinic input]
12518str.rstrip as unicode_rstrip
12519
12520 chars: object = NULL
12521 /
12522
12523Return a copy of the string with trailing whitespace removed.
12524
12525If chars is given and not None, remove characters in chars instead.
12526[clinic start generated code]*/
12527
12528static PyObject *
12529unicode_rstrip_impl(PyObject *self, PyObject *chars)
12530/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
12531{
12532 return do_argstrip(self, RIGHTSTRIP, chars);
12533}
12534
12535
12536static PyObject*
12537unicode_repeat(PyObject *str, Py_ssize_t len)
12538{
12539 PyObject *u;
12540 Py_ssize_t nchars, n;
12541
12542 if (len < 1)
12543 _Py_RETURN_UNICODE_EMPTY();
12544
12545 /* no repeat, return original string */
12546 if (len == 1)
12547 return unicode_result_unchanged(str);
12548
12549 if (PyUnicode_READY(str) == -1)
12550 return NULL;
12551
12552 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12553 PyErr_SetString(PyExc_OverflowError,
12554 "repeated string is too long");
12555 return NULL;
12556 }
12557 nchars = len * PyUnicode_GET_LENGTH(str);
12558
12559 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12560 if (!u)
12561 return NULL;
12562 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12563
12564 if (PyUnicode_GET_LENGTH(str) == 1) {
12565 const int kind = PyUnicode_KIND(str);
12566 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12567 if (kind == PyUnicode_1BYTE_KIND) {
12568 void *to = PyUnicode_DATA(u);
12569 memset(to, (unsigned char)fill_char, len);
12570 }
12571 else if (kind == PyUnicode_2BYTE_KIND) {
12572 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12573 for (n = 0; n < len; ++n)
12574 ucs2[n] = fill_char;
12575 } else {
12576 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12577 assert(kind == PyUnicode_4BYTE_KIND);
12578 for (n = 0; n < len; ++n)
12579 ucs4[n] = fill_char;
12580 }
12581 }
12582 else {
12583 /* number of characters copied this far */
12584 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12585 const Py_ssize_t char_size = PyUnicode_KIND(str);
12586 char *to = (char *) PyUnicode_DATA(u);
12587 memcpy(to, PyUnicode_DATA(str),
12588 PyUnicode_GET_LENGTH(str) * char_size);
12589 while (done < nchars) {
12590 n = (done <= nchars-done) ? done : nchars-done;
12591 memcpy(to + (done * char_size), to, n * char_size);
12592 done += n;
12593 }
12594 }
12595
12596 assert(_PyUnicode_CheckConsistency(u, 1));
12597 return u;
12598}
12599
12600PyObject *
12601PyUnicode_Replace(PyObject *str,
12602 PyObject *substr,
12603 PyObject *replstr,
12604 Py_ssize_t maxcount)
12605{
12606 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12607 ensure_unicode(replstr) < 0)
12608 return NULL;
12609 return replace(str, substr, replstr, maxcount);
12610}
12611
12612/*[clinic input]
12613str.replace as unicode_replace
12614
12615 old: unicode
12616 new: unicode
12617 count: Py_ssize_t = -1
12618 Maximum number of occurrences to replace.
12619 -1 (the default value) means replace all occurrences.
12620 /
12621
12622Return a copy with all occurrences of substring old replaced by new.
12623
12624If the optional argument count is given, only the first count occurrences are
12625replaced.
12626[clinic start generated code]*/
12627
12628static PyObject *
12629unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12630 Py_ssize_t count)
12631/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12632{
12633 if (PyUnicode_READY(self) == -1)
12634 return NULL;
12635 return replace(self, old, new, count);
12636}
12637
12638static PyObject *
12639unicode_repr(PyObject *unicode)
12640{
12641 PyObject *repr;
12642 Py_ssize_t isize;
12643 Py_ssize_t osize, squote, dquote, i, o;
12644 Py_UCS4 max, quote;
12645 int ikind, okind, unchanged;
12646 void *idata, *odata;
12647
12648 if (PyUnicode_READY(unicode) == -1)
12649 return NULL;
12650
12651 isize = PyUnicode_GET_LENGTH(unicode);
12652 idata = PyUnicode_DATA(unicode);
12653
12654 /* Compute length of output, quote characters, and
12655 maximum character */
12656 osize = 0;
12657 max = 127;
12658 squote = dquote = 0;
12659 ikind = PyUnicode_KIND(unicode);
12660 for (i = 0; i < isize; i++) {
12661 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12662 Py_ssize_t incr = 1;
12663 switch (ch) {
12664 case '\'': squote++; break;
12665 case '"': dquote++; break;
12666 case '\\': case '\t': case '\r': case '\n':
12667 incr = 2;
12668 break;
12669 default:
12670 /* Fast-path ASCII */
12671 if (ch < ' ' || ch == 0x7f)
12672 incr = 4; /* \xHH */
12673 else if (ch < 0x7f)
12674 ;
12675 else if (Py_UNICODE_ISPRINTABLE(ch))
12676 max = ch > max ? ch : max;
12677 else if (ch < 0x100)
12678 incr = 4; /* \xHH */
12679 else if (ch < 0x10000)
12680 incr = 6; /* \uHHHH */
12681 else
12682 incr = 10; /* \uHHHHHHHH */
12683 }
12684 if (osize > PY_SSIZE_T_MAX - incr) {
12685 PyErr_SetString(PyExc_OverflowError,
12686 "string is too long to generate repr");
12687 return NULL;
12688 }
12689 osize += incr;
12690 }
12691
12692 quote = '\'';
12693 unchanged = (osize == isize);
12694 if (squote) {
12695 unchanged = 0;
12696 if (dquote)
12697 /* Both squote and dquote present. Use squote,
12698 and escape them */
12699 osize += squote;
12700 else
12701 quote = '"';
12702 }
12703 osize += 2; /* quotes */
12704
12705 repr = PyUnicode_New(osize, max);
12706 if (repr == NULL)
12707 return NULL;
12708 okind = PyUnicode_KIND(repr);
12709 odata = PyUnicode_DATA(repr);
12710
12711 PyUnicode_WRITE(okind, odata, 0, quote);
12712 PyUnicode_WRITE(okind, odata, osize-1, quote);
12713 if (unchanged) {
12714 _PyUnicode_FastCopyCharacters(repr, 1,
12715 unicode, 0,
12716 isize);
12717 }
12718 else {
12719 for (i = 0, o = 1; i < isize; i++) {
12720 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12721
12722 /* Escape quotes and backslashes */
12723 if ((ch == quote) || (ch == '\\')) {
12724 PyUnicode_WRITE(okind, odata, o++, '\\');
12725 PyUnicode_WRITE(okind, odata, o++, ch);
12726 continue;
12727 }
12728
12729 /* Map special whitespace to '\t', \n', '\r' */
12730 if (ch == '\t') {
12731 PyUnicode_WRITE(okind, odata, o++, '\\');
12732 PyUnicode_WRITE(okind, odata, o++, 't');
12733 }
12734 else if (ch == '\n') {
12735 PyUnicode_WRITE(okind, odata, o++, '\\');
12736 PyUnicode_WRITE(okind, odata, o++, 'n');
12737 }
12738 else if (ch == '\r') {
12739 PyUnicode_WRITE(okind, odata, o++, '\\');
12740 PyUnicode_WRITE(okind, odata, o++, 'r');
12741 }
12742
12743 /* Map non-printable US ASCII to '\xhh' */
12744 else if (ch < ' ' || ch == 0x7F) {
12745 PyUnicode_WRITE(okind, odata, o++, '\\');
12746 PyUnicode_WRITE(okind, odata, o++, 'x');
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12748 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12749 }
12750
12751 /* Copy ASCII characters as-is */
12752 else if (ch < 0x7F) {
12753 PyUnicode_WRITE(okind, odata, o++, ch);
12754 }
12755
12756 /* Non-ASCII characters */
12757 else {
12758 /* Map Unicode whitespace and control characters
12759 (categories Z* and C* except ASCII space)
12760 */
12761 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12762 PyUnicode_WRITE(okind, odata, o++, '\\');
12763 /* Map 8-bit characters to '\xhh' */
12764 if (ch <= 0xff) {
12765 PyUnicode_WRITE(okind, odata, o++, 'x');
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12768 }
12769 /* Map 16-bit characters to '\uxxxx' */
12770 else if (ch <= 0xffff) {
12771 PyUnicode_WRITE(okind, odata, o++, 'u');
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12776 }
12777 /* Map 21-bit characters to '\U00xxxxxx' */
12778 else {
12779 PyUnicode_WRITE(okind, odata, o++, 'U');
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12782 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12783 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12784 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12785 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12786 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12787 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12788 }
12789 }
12790 /* Copy characters as-is */
12791 else {
12792 PyUnicode_WRITE(okind, odata, o++, ch);
12793 }
12794 }
12795 }
12796 }
12797 /* Closing quote already added at the beginning */
12798 assert(_PyUnicode_CheckConsistency(repr, 1));
12799 return repr;
12800}
12801
12802PyDoc_STRVAR(rfind__doc__,
12803 "S.rfind(sub[, start[, end]]) -> int\n\
12804\n\
12805Return the highest index in S where substring sub is found,\n\
12806such that sub is contained within S[start:end]. Optional\n\
12807arguments start and end are interpreted as in slice notation.\n\
12808\n\
12809Return -1 on failure.");
12810
12811static PyObject *
12812unicode_rfind(PyObject *self, PyObject *args)
12813{
12814 /* initialize variables to prevent gcc warning */
12815 PyObject *substring = NULL;
12816 Py_ssize_t start = 0;
12817 Py_ssize_t end = 0;
12818 Py_ssize_t result;
12819
12820 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12821 return NULL;
12822
12823 if (PyUnicode_READY(self) == -1)
12824 return NULL;
12825
12826 result = any_find_slice(self, substring, start, end, -1);
12827
12828 if (result == -2)
12829 return NULL;
12830
12831 return PyLong_FromSsize_t(result);
12832}
12833
12834PyDoc_STRVAR(rindex__doc__,
12835 "S.rindex(sub[, start[, end]]) -> int\n\
12836\n\
12837Return the highest index in S where substring sub is found,\n\
12838such that sub is contained within S[start:end]. Optional\n\
12839arguments start and end are interpreted as in slice notation.\n\
12840\n\
12841Raises ValueError when the substring is not found.");
12842
12843static PyObject *
12844unicode_rindex(PyObject *self, PyObject *args)
12845{
12846 /* initialize variables to prevent gcc warning */
12847 PyObject *substring = NULL;
12848 Py_ssize_t start = 0;
12849 Py_ssize_t end = 0;
12850 Py_ssize_t result;
12851
12852 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12853 return NULL;
12854
12855 if (PyUnicode_READY(self) == -1)
12856 return NULL;
12857
12858 result = any_find_slice(self, substring, start, end, -1);
12859
12860 if (result == -2)
12861 return NULL;
12862
12863 if (result < 0) {
12864 PyErr_SetString(PyExc_ValueError, "substring not found");
12865 return NULL;
12866 }
12867
12868 return PyLong_FromSsize_t(result);
12869}
12870
12871/*[clinic input]
12872str.rjust as unicode_rjust
12873
12874 width: Py_ssize_t
12875 fillchar: Py_UCS4 = ' '
12876 /
12877
12878Return a right-justified string of length width.
12879
12880Padding is done using the specified fill character (default is a space).
12881[clinic start generated code]*/
12882
12883static PyObject *
12884unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12885/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12886{
12887 if (PyUnicode_READY(self) == -1)
12888 return NULL;
12889
12890 if (PyUnicode_GET_LENGTH(self) >= width)
12891 return unicode_result_unchanged(self);
12892
12893 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12894}
12895
12896PyObject *
12897PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12898{
12899 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12900 return NULL;
12901
12902 return split(s, sep, maxsplit);
12903}
12904
12905/*[clinic input]
12906str.split as unicode_split
12907
12908 sep: object = None
        The delimiter according to which to split the string.
12910 None (the default value) means split according to any whitespace,
12911 and discard empty strings from the result.
12912 maxsplit: Py_ssize_t = -1
12913 Maximum number of splits to do.
12914 -1 (the default value) means no limit.
12915
12916Return a list of the words in the string, using sep as the delimiter string.
12917[clinic start generated code]*/
12918
12919static PyObject *
12920unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12921/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12922{
12923 if (sep == Py_None)
12924 return split(self, NULL, maxsplit);
12925 if (PyUnicode_Check(sep))
12926 return split(self, sep, maxsplit);
12927
12928 PyErr_Format(PyExc_TypeError,
12929 "must be str or None, not %.100s",
12930 Py_TYPE(sep)->tp_name);
12931 return NULL;
12932}
12933
12934PyObject *
12935PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12936{
12937 PyObject* out;
12938 int kind1, kind2;
12939 void *buf1, *buf2;
12940 Py_ssize_t len1, len2;
12941
12942 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12943 return NULL;
12944
12945 kind1 = PyUnicode_KIND(str_obj);
12946 kind2 = PyUnicode_KIND(sep_obj);
12947 len1 = PyUnicode_GET_LENGTH(str_obj);
12948 len2 = PyUnicode_GET_LENGTH(sep_obj);
12949 if (kind1 < kind2 || len1 < len2) {
12950 _Py_INCREF_UNICODE_EMPTY();
12951 if (!unicode_empty)
12952 out = NULL;
12953 else {
12954 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12955 Py_DECREF(unicode_empty);
12956 }
12957 return out;
12958 }
12959 buf1 = PyUnicode_DATA(str_obj);
12960 buf2 = PyUnicode_DATA(sep_obj);
12961 if (kind2 != kind1) {
12962 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12963 if (!buf2)
12964 return NULL;
12965 }
12966
12967 switch (kind1) {
12968 case PyUnicode_1BYTE_KIND:
12969 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12970 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12971 else
12972 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12973 break;
12974 case PyUnicode_2BYTE_KIND:
12975 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12976 break;
12977 case PyUnicode_4BYTE_KIND:
12978 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12979 break;
12980 default:
12981 assert(0);
12982 out = 0;
12983 }
12984
12985 if (kind2 != kind1)
12986 PyMem_Free(buf2);
12987
12988 return out;
12989}
12990
12991
12992PyObject *
12993PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12994{
12995 PyObject* out;
12996 int kind1, kind2;
12997 void *buf1, *buf2;
12998 Py_ssize_t len1, len2;
12999
13000 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13001 return NULL;
13002
13003 kind1 = PyUnicode_KIND(str_obj);
13004 kind2 = PyUnicode_KIND(sep_obj);
13005 len1 = PyUnicode_GET_LENGTH(str_obj);
13006 len2 = PyUnicode_GET_LENGTH(sep_obj);
13007 if (kind1 < kind2 || len1 < len2) {
13008 _Py_INCREF_UNICODE_EMPTY();
13009 if (!unicode_empty)
13010 out = NULL;
13011 else {
13012 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13013 Py_DECREF(unicode_empty);
13014 }
13015 return out;
13016 }
13017 buf1 = PyUnicode_DATA(str_obj);
13018 buf2 = PyUnicode_DATA(sep_obj);
13019 if (kind2 != kind1) {
13020 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13021 if (!buf2)
13022 return NULL;
13023 }
13024
13025 switch (kind1) {
13026 case PyUnicode_1BYTE_KIND:
13027 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13028 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13029 else
13030 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13031 break;
13032 case PyUnicode_2BYTE_KIND:
13033 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13034 break;
13035 case PyUnicode_4BYTE_KIND:
13036 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13037 break;
13038 default:
13039 assert(0);
13040 out = 0;
13041 }
13042
13043 if (kind2 != kind1)
13044 PyMem_Free(buf2);
13045
13046 return out;
13047}
13048
13049/*[clinic input]
13050str.partition as unicode_partition
13051
13052 sep: object
13053 /
13054
13055Partition the string into three parts using the given separator.
13056
13057This will search for the separator in the string. If the separator is found,
13058returns a 3-tuple containing the part before the separator, the separator
13059itself, and the part after it.
13060
13061If the separator is not found, returns a 3-tuple containing the original string
13062and two empty strings.
13063[clinic start generated code]*/
13064
13065static PyObject *
13066unicode_partition(PyObject *self, PyObject *sep)
13067/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13068{
13069 return PyUnicode_Partition(self, sep);
13070}
13071
13072/*[clinic input]
13073str.rpartition as unicode_rpartition = str.partition
13074
13075Partition the string into three parts using the given separator.
13076
This will search for the separator in the string, starting at the end. If
13078the separator is found, returns a 3-tuple containing the part before the
13079separator, the separator itself, and the part after it.
13080
13081If the separator is not found, returns a 3-tuple containing two empty strings
13082and the original string.
13083[clinic start generated code]*/
13084
13085static PyObject *
13086unicode_rpartition(PyObject *self, PyObject *sep)
13087/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
13088{
13089 return PyUnicode_RPartition(self, sep);
13090}
13091
13092PyObject *
13093PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13094{
13095 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13096 return NULL;
13097
13098 return rsplit(s, sep, maxsplit);
13099}
13100
13101/*[clinic input]
13102str.rsplit as unicode_rsplit = str.split
13103
13104Return a list of the words in the string, using sep as the delimiter string.
13105
13106Splits are done starting at the end of the string and working to the front.
13107[clinic start generated code]*/
13108
13109static PyObject *
13110unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13111/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13112{
13113 if (sep == Py_None)
13114 return rsplit(self, NULL, maxsplit);
13115 if (PyUnicode_Check(sep))
13116 return rsplit(self, sep, maxsplit);
13117
13118 PyErr_Format(PyExc_TypeError,
13119 "must be str or None, not %.100s",
13120 Py_TYPE(sep)->tp_name);
13121 return NULL;
13122}
13123
13124/*[clinic input]
13125str.splitlines as unicode_splitlines
13126
13127 keepends: bool(accept={int}) = False
13128
13129Return a list of the lines in the string, breaking at line boundaries.
13130
13131Line breaks are not included in the resulting list unless keepends is given and
13132true.
13133[clinic start generated code]*/
13134
13135static PyObject *
13136unicode_splitlines_impl(PyObject *self, int keepends)
13137/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13138{
13139 return PyUnicode_Splitlines(self, keepends);
13140}
13141
13142static
13143PyObject *unicode_str(PyObject *self)
13144{
13145 return unicode_result_unchanged(self);
13146}
13147
13148/*[clinic input]
13149str.swapcase as unicode_swapcase
13150
13151Convert uppercase characters to lowercase and lowercase characters to uppercase.
13152[clinic start generated code]*/
13153
13154static PyObject *
13155unicode_swapcase_impl(PyObject *self)
13156/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13157{
13158 if (PyUnicode_READY(self) == -1)
13159 return NULL;
13160 return case_operation(self, do_swapcase);
13161}
13162
13163/*[clinic input]
13164
13165@staticmethod
13166str.maketrans as unicode_maketrans
13167
13168 x: object
13169
13170 y: unicode=NULL
13171
13172 z: unicode=NULL
13173
13174 /
13175
13176Return a translation table usable for str.translate().
13177
13178If there is only one argument, it must be a dictionary mapping Unicode
13179ordinals (integers) or characters to Unicode ordinals, strings or None.
13180Character keys will be then converted to ordinals.
13181If there are two arguments, they must be strings of equal length, and
13182in the resulting dictionary, each character in x will be mapped to the
13183character at the same position in y. If there is a third argument, it
13184must be a string, whose characters will be mapped to None in the result.
13185[clinic start generated code]*/
13186
13187static PyObject *
13188unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13189/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13190{
13191 PyObject *new = NULL, *key, *value;
13192 Py_ssize_t i = 0;
13193 int res;
13194
13195 new = PyDict_New();
13196 if (!new)
13197 return NULL;
13198 if (y != NULL) {
13199 int x_kind, y_kind, z_kind;
13200 void *x_data, *y_data, *z_data;
13201
13202 /* x must be a string too, of equal length */
13203 if (!PyUnicode_Check(x)) {
13204 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13205 "be a string if there is a second argument");
13206 goto err;
13207 }
13208 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13209 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13210 "arguments must have equal length");
13211 goto err;
13212 }
13213 /* create entries for translating chars in x to those in y */
13214 x_kind = PyUnicode_KIND(x);
13215 y_kind = PyUnicode_KIND(y);
13216 x_data = PyUnicode_DATA(x);
13217 y_data = PyUnicode_DATA(y);
13218 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13219 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13220 if (!key)
13221 goto err;
13222 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13223 if (!value) {
13224 Py_DECREF(key);
13225 goto err;
13226 }
13227 res = PyDict_SetItem(new, key, value);
13228 Py_DECREF(key);
13229 Py_DECREF(value);
13230 if (res < 0)
13231 goto err;
13232 }
13233 /* create entries for deleting chars in z */
13234 if (z != NULL) {
13235 z_kind = PyUnicode_KIND(z);
13236 z_data = PyUnicode_DATA(z);
13237 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13238 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13239 if (!key)
13240 goto err;
13241 res = PyDict_SetItem(new, key, Py_None);
13242 Py_DECREF(key);
13243 if (res < 0)
13244 goto err;
13245 }
13246 }
13247 } else {
13248 int kind;
13249 void *data;
13250
13251 /* x must be a dict */
13252 if (!PyDict_CheckExact(x)) {
13253 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13254 "to maketrans it must be a dict");
13255 goto err;
13256 }
13257 /* copy entries into the new dict, converting string keys to int keys */
13258 while (PyDict_Next(x, &i, &key, &value)) {
13259 if (PyUnicode_Check(key)) {
13260 /* convert string keys to integer keys */
13261 PyObject *newkey;
13262 if (PyUnicode_GET_LENGTH(key) != 1) {
13263 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13264 "table must be of length 1");
13265 goto err;
13266 }
13267 kind = PyUnicode_KIND(key);
13268 data = PyUnicode_DATA(key);
13269 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13270 if (!newkey)
13271 goto err;
13272 res = PyDict_SetItem(new, newkey, value);
13273 Py_DECREF(newkey);
13274 if (res < 0)
13275 goto err;
13276 } else if (PyLong_Check(key)) {
13277 /* just keep integer keys */
13278 if (PyDict_SetItem(new, key, value) < 0)
13279 goto err;
13280 } else {
13281 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13282 "be strings or integers");
13283 goto err;
13284 }
13285 }
13286 }
13287 return new;
13288 err:
13289 Py_DECREF(new);
13290 return NULL;
13291}
13292
13293/*[clinic input]
13294str.translate as unicode_translate
13295
13296 table: object
13297 Translation table, which must be a mapping of Unicode ordinals to
13298 Unicode ordinals, strings, or None.
13299 /
13300
13301Replace each character in the string using the given translation table.
13302
13303The table must implement lookup/indexing via __getitem__, for instance a
13304dictionary or list. If this operation raises LookupError, the character is
13305left untouched. Characters mapped to None are deleted.
13306[clinic start generated code]*/
13307
13308static PyObject *
13309unicode_translate(PyObject *self, PyObject *table)
13310/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13311{
13312 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13313}
13314
13315/*[clinic input]
13316str.upper as unicode_upper
13317
13318Return a copy of the string converted to uppercase.
13319[clinic start generated code]*/
13320
13321static PyObject *
13322unicode_upper_impl(PyObject *self)
13323/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13324{
13325 if (PyUnicode_READY(self) == -1)
13326 return NULL;
13327 if (PyUnicode_IS_ASCII(self))
13328 return ascii_upper_or_lower(self, 0);
13329 return case_operation(self, do_upper);
13330}
13331
13332/*[clinic input]
13333str.zfill as unicode_zfill
13334
13335 width: Py_ssize_t
13336 /
13337
13338Pad a numeric string with zeros on the left, to fill a field of the given width.
13339
13340The string is never truncated.
13341[clinic start generated code]*/
13342
13343static PyObject *
13344unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13345/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13346{
13347 Py_ssize_t fill;
13348 PyObject *u;
13349 int kind;
13350 void *data;
13351 Py_UCS4 chr;
13352
13353 if (PyUnicode_READY(self) == -1)
13354 return NULL;
13355
13356 if (PyUnicode_GET_LENGTH(self) >= width)
13357 return unicode_result_unchanged(self);
13358
13359 fill = width - PyUnicode_GET_LENGTH(self);
13360
13361 u = pad(self, fill, 0, '0');
13362
13363 if (u == NULL)
13364 return NULL;
13365
13366 kind = PyUnicode_KIND(u);
13367 data = PyUnicode_DATA(u);
13368 chr = PyUnicode_READ(kind, data, fill);
13369
13370 if (chr == '+' || chr == '-') {
13371 /* move sign to beginning of string */
13372 PyUnicode_WRITE(kind, data, 0, chr);
13373 PyUnicode_WRITE(kind, data, fill, '0');
13374 }
13375
13376 assert(_PyUnicode_CheckConsistency(u, 1));
13377 return u;
13378}
13379
#if 0
/* Compiled out by the surrounding "#if 0"; would simply forward to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13387
13388PyDoc_STRVAR(startswith__doc__,
13389 "S.startswith(prefix[, start[, end]]) -> bool\n\
13390\n\
13391Return True if S starts with the specified prefix, False otherwise.\n\
13392With optional start, test S beginning at that position.\n\
13393With optional end, stop comparing S at that position.\n\
13394prefix can also be a tuple of strings to try.");
13395
13396static PyObject *
13397unicode_startswith(PyObject *self,
13398 PyObject *args)
13399{
13400 PyObject *subobj;
13401 PyObject *substring;
13402 Py_ssize_t start = 0;
13403 Py_ssize_t end = PY_SSIZE_T_MAX;
13404 int result;
13405
13406 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13407 return NULL;
13408 if (PyTuple_Check(subobj)) {
13409 Py_ssize_t i;
13410 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13411 substring = PyTuple_GET_ITEM(subobj, i);
13412 if (!PyUnicode_Check(substring)) {
13413 PyErr_Format(PyExc_TypeError,
13414 "tuple for startswith must only contain str, "
13415 "not %.100s",
13416 Py_TYPE(substring)->tp_name);
13417 return NULL;
13418 }
13419 result = tailmatch(self, substring, start, end, -1);
13420 if (result == -1)
13421 return NULL;
13422 if (result) {
13423 Py_RETURN_TRUE;
13424 }
13425 }
13426 /* nothing matched */
13427 Py_RETURN_FALSE;
13428 }
13429 if (!PyUnicode_Check(subobj)) {
13430 PyErr_Format(PyExc_TypeError,
13431 "startswith first arg must be str or "
13432 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13433 return NULL;
13434 }
13435 result = tailmatch(self, subobj, start, end, -1);
13436 if (result == -1)
13437 return NULL;
13438 return PyBool_FromLong(result);
13439}
13440
13441
13442PyDoc_STRVAR(endswith__doc__,
13443 "S.endswith(suffix[, start[, end]]) -> bool\n\
13444\n\
13445Return True if S ends with the specified suffix, False otherwise.\n\
13446With optional start, test S beginning at that position.\n\
13447With optional end, stop comparing S at that position.\n\
13448suffix can also be a tuple of strings to try.");
13449
13450static PyObject *
13451unicode_endswith(PyObject *self,
13452 PyObject *args)
13453{
13454 PyObject *subobj;
13455 PyObject *substring;
13456 Py_ssize_t start = 0;
13457 Py_ssize_t end = PY_SSIZE_T_MAX;
13458 int result;
13459
13460 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13461 return NULL;
13462 if (PyTuple_Check(subobj)) {
13463 Py_ssize_t i;
13464 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13465 substring = PyTuple_GET_ITEM(subobj, i);
13466 if (!PyUnicode_Check(substring)) {
13467 PyErr_Format(PyExc_TypeError,
13468 "tuple for endswith must only contain str, "
13469 "not %.100s",
13470 Py_TYPE(substring)->tp_name);
13471 return NULL;
13472 }
13473 result = tailmatch(self, substring, start, end, +1);
13474 if (result == -1)
13475 return NULL;
13476 if (result) {
13477 Py_RETURN_TRUE;
13478 }
13479 }
13480 Py_RETURN_FALSE;
13481 }
13482 if (!PyUnicode_Check(subobj)) {
13483 PyErr_Format(PyExc_TypeError,
13484 "endswith first arg must be str or "
13485 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13486 return NULL;
13487 }
13488 result = tailmatch(self, subobj, start, end, +1);
13489 if (result == -1)
13490 return NULL;
13491 return PyBool_FromLong(result);
13492}
13493
13494static inline void
13495_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13496{
13497 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13498 writer->data = PyUnicode_DATA(writer->buffer);
13499
13500 if (!writer->readonly) {
13501 writer->kind = PyUnicode_KIND(writer->buffer);
13502 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13503 }
13504 else {
13505 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13506 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13507 writer->kind = PyUnicode_WCHAR_KIND;
13508 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13509
13510 /* Copy-on-write mode: set buffer size to 0 so
13511 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13512 * next write. */
13513 writer->size = 0;
13514 }
13515}
13516
13517void
13518_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13519{
13520 memset(writer, 0, sizeof(*writer));
13521
13522 /* ASCII is the bare minimum */
13523 writer->min_char = 127;
13524
13525 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13526 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13527 writer->kind = PyUnicode_WCHAR_KIND;
13528 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13529}
13530
13531int
13532_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13533 Py_ssize_t length, Py_UCS4 maxchar)
13534{
13535 Py_ssize_t newlen;
13536 PyObject *newbuffer;
13537
13538 assert(maxchar <= MAX_UNICODE);
13539
13540 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13541 assert((maxchar > writer->maxchar && length >= 0)
13542 || length > 0);
13543
13544 if (length > PY_SSIZE_T_MAX - writer->pos) {
13545 PyErr_NoMemory();
13546 return -1;
13547 }
13548 newlen = writer->pos + length;
13549
13550 maxchar = Py_MAX(maxchar, writer->min_char);
13551
13552 if (writer->buffer == NULL) {
13553 assert(!writer->readonly);
13554 if (writer->overallocate
13555 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13556 /* overallocate to limit the number of realloc() */
13557 newlen += newlen / OVERALLOCATE_FACTOR;
13558 }
13559 if (newlen < writer->min_length)
13560 newlen = writer->min_length;
13561
13562 writer->buffer = PyUnicode_New(newlen, maxchar);
13563 if (writer->buffer == NULL)
13564 return -1;
13565 }
13566 else if (newlen > writer->size) {
13567 if (writer->overallocate
13568 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13569 /* overallocate to limit the number of realloc() */
13570 newlen += newlen / OVERALLOCATE_FACTOR;
13571 }
13572 if (newlen < writer->min_length)
13573 newlen = writer->min_length;
13574
13575 if (maxchar > writer->maxchar || writer->readonly) {
13576 /* resize + widen */
13577 maxchar = Py_MAX(maxchar, writer->maxchar);
13578 newbuffer = PyUnicode_New(newlen, maxchar);
13579 if (newbuffer == NULL)
13580 return -1;
13581 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13582 writer->buffer, 0, writer->pos);
13583 Py_DECREF(writer->buffer);
13584 writer->readonly = 0;
13585 }
13586 else {
13587 newbuffer = resize_compact(writer->buffer, newlen);
13588 if (newbuffer == NULL)
13589 return -1;
13590 }
13591 writer->buffer = newbuffer;
13592 }
13593 else if (maxchar > writer->maxchar) {
13594 assert(!writer->readonly);
13595 newbuffer = PyUnicode_New(writer->size, maxchar);
13596 if (newbuffer == NULL)
13597 return -1;
13598 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13599 writer->buffer, 0, writer->pos);
13600 Py_SETREF(writer->buffer, newbuffer);
13601 }
13602 _PyUnicodeWriter_Update(writer);
13603 return 0;
13604
13605#undef OVERALLOCATE_FACTOR
13606}
13607
13608int
13609_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13610 enum PyUnicode_Kind kind)
13611{
13612 Py_UCS4 maxchar;
13613
13614 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13615 assert(writer->kind < kind);
13616
13617 switch (kind)
13618 {
13619 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13620 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13621 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13622 default:
13623 assert(0 && "invalid kind");
13624 return -1;
13625 }
13626
13627 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13628}
13629
13630static inline int
13631_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13632{
13633 assert(ch <= MAX_UNICODE);
13634 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13635 return -1;
13636 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13637 writer->pos++;
13638 return 0;
13639}
13640
13641int
13642_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13643{
13644 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13645}
13646
13647int
13648_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13649{
13650 Py_UCS4 maxchar;
13651 Py_ssize_t len;
13652
13653 if (PyUnicode_READY(str) == -1)
13654 return -1;
13655 len = PyUnicode_GET_LENGTH(str);
13656 if (len == 0)
13657 return 0;
13658 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13659 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13660 if (writer->buffer == NULL && !writer->overallocate) {
13661 assert(_PyUnicode_CheckConsistency(str, 1));
13662 writer->readonly = 1;
13663 Py_INCREF(str);
13664 writer->buffer = str;
13665 _PyUnicodeWriter_Update(writer);
13666 writer->pos += len;
13667 return 0;
13668 }
13669 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13670 return -1;
13671 }
13672 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13673 str, 0, len);
13674 writer->pos += len;
13675 return 0;
13676}
13677
13678int
13679_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13680 Py_ssize_t start, Py_ssize_t end)
13681{
13682 Py_UCS4 maxchar;
13683 Py_ssize_t len;
13684
13685 if (PyUnicode_READY(str) == -1)
13686 return -1;
13687
13688 assert(0 <= start);
13689 assert(end <= PyUnicode_GET_LENGTH(str));
13690 assert(start <= end);
13691
13692 if (end == 0)
13693 return 0;
13694
13695 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13696 return _PyUnicodeWriter_WriteStr(writer, str);
13697
13698 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13699 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13700 else
13701 maxchar = writer->maxchar;
13702 len = end - start;
13703
13704 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13705 return -1;
13706
13707 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13708 str, start, len);
13709 writer->pos += len;
13710 return 0;
13711}
13712
13713int
13714_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13715 const char *ascii, Py_ssize_t len)
13716{
13717 if (len == -1)
13718 len = strlen(ascii);
13719
13720 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13721
13722 if (writer->buffer == NULL && !writer->overallocate) {
13723 PyObject *str;
13724
13725 str = _PyUnicode_FromASCII(ascii, len);
13726 if (str == NULL)
13727 return -1;
13728
13729 writer->readonly = 1;
13730 writer->buffer = str;
13731 _PyUnicodeWriter_Update(writer);
13732 writer->pos += len;
13733 return 0;
13734 }
13735
13736 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13737 return -1;
13738
13739 switch (writer->kind)
13740 {
13741 case PyUnicode_1BYTE_KIND:
13742 {
13743 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13744 Py_UCS1 *data = writer->data;
13745
13746 memcpy(data + writer->pos, str, len);
13747 break;
13748 }
13749 case PyUnicode_2BYTE_KIND:
13750 {
13751 _PyUnicode_CONVERT_BYTES(
13752 Py_UCS1, Py_UCS2,
13753 ascii, ascii + len,
13754 (Py_UCS2 *)writer->data + writer->pos);
13755 break;
13756 }
13757 case PyUnicode_4BYTE_KIND:
13758 {
13759 _PyUnicode_CONVERT_BYTES(
13760 Py_UCS1, Py_UCS4,
13761 ascii, ascii + len,
13762 (Py_UCS4 *)writer->data + writer->pos);
13763 break;
13764 }
13765 default:
13766 assert(0);
13767 }
13768
13769 writer->pos += len;
13770 return 0;
13771}
13772
13773int
13774_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13775 const char *str, Py_ssize_t len)
13776{
13777 Py_UCS4 maxchar;
13778
13779 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13780 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13781 return -1;
13782 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13783 writer->pos += len;
13784 return 0;
13785}
13786
13787PyObject *
13788_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13789{
13790 PyObject *str;
13791
13792 if (writer->pos == 0) {
13793 Py_CLEAR(writer->buffer);
13794 _Py_RETURN_UNICODE_EMPTY();
13795 }
13796
13797 str = writer->buffer;
13798 writer->buffer = NULL;
13799
13800 if (writer->readonly) {
13801 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13802 return str;
13803 }
13804
13805 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13806 PyObject *str2;
13807 str2 = resize_compact(str, writer->pos);
13808 if (str2 == NULL) {
13809 Py_DECREF(str);
13810 return NULL;
13811 }
13812 str = str2;
13813 }
13814
13815 assert(_PyUnicode_CheckConsistency(str, 1));
13816 return unicode_result_ready(str);
13817}
13818
13819void
13820_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13821{
13822 Py_CLEAR(writer->buffer);
13823}
13824
13825#include "stringlib/unicode_format.h"
13826
13827PyDoc_STRVAR(format__doc__,
13828 "S.format(*args, **kwargs) -> str\n\
13829\n\
13830Return a formatted version of S, using substitutions from args and kwargs.\n\
13831The substitutions are identified by braces ('{' and '}').");
13832
13833PyDoc_STRVAR(format_map__doc__,
13834 "S.format_map(mapping) -> str\n\
13835\n\
13836Return a formatted version of S, using substitutions from mapping.\n\
13837The substitutions are identified by braces ('{' and '}').");
13838
13839/*[clinic input]
13840str.__format__ as unicode___format__
13841
13842 format_spec: unicode
13843 /
13844
13845Return a formatted version of the string as described by format_spec.
13846[clinic start generated code]*/
13847
13848static PyObject *
13849unicode___format___impl(PyObject *self, PyObject *format_spec)
13850/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13851{
13852 _PyUnicodeWriter writer;
13853 int ret;
13854
13855 if (PyUnicode_READY(self) == -1)
13856 return NULL;
13857 _PyUnicodeWriter_Init(&writer);
13858 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13859 self, format_spec, 0,
13860 PyUnicode_GET_LENGTH(format_spec));
13861 if (ret == -1) {
13862 _PyUnicodeWriter_Dealloc(&writer);
13863 return NULL;
13864 }
13865 return _PyUnicodeWriter_Finish(&writer);
13866}
13867
13868/*[clinic input]
13869str.__sizeof__ as unicode_sizeof
13870
13871Return the size of the string in memory, in bytes.
13872[clinic start generated code]*/
13873
13874static PyObject *
13875unicode_sizeof_impl(PyObject *self)
13876/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13877{
13878 Py_ssize_t size;
13879
13880 /* If it's a compact object, account for base structure +
13881 character data. */
13882 if (PyUnicode_IS_COMPACT_ASCII(self))
13883 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13884 else if (PyUnicode_IS_COMPACT(self))
13885 size = sizeof(PyCompactUnicodeObject) +
13886 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13887 else {
13888 /* If it is a two-block object, account for base object, and
13889 for character block if present. */
13890 size = sizeof(PyUnicodeObject);
13891 if (_PyUnicode_DATA_ANY(self))
13892 size += (PyUnicode_GET_LENGTH(self) + 1) *
13893 PyUnicode_KIND(self);
13894 }
13895 /* If the wstr pointer is present, account for it unless it is shared
13896 with the data pointer. Check if the data is not shared. */
13897 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13898 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13899 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13900 size += PyUnicode_UTF8_LENGTH(self) + 1;
13901
13902 return PyLong_FromSsize_t(size);
13903}
13904
13905static PyObject *
13906unicode_getnewargs(PyObject *v)
13907{
13908 PyObject *copy = _PyUnicode_Copy(v);
13909 if (!copy)
13910 return NULL;
13911 return Py_BuildValue("(N)", copy);
13912}
13913
13914static PyMethodDef unicode_methods[] = {
13915 UNICODE_ENCODE_METHODDEF
13916 UNICODE_REPLACE_METHODDEF
13917 UNICODE_SPLIT_METHODDEF
13918 UNICODE_RSPLIT_METHODDEF
13919 UNICODE_JOIN_METHODDEF
13920 UNICODE_CAPITALIZE_METHODDEF
13921 UNICODE_CASEFOLD_METHODDEF
13922 UNICODE_TITLE_METHODDEF
13923 UNICODE_CENTER_METHODDEF
13924 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13925 UNICODE_EXPANDTABS_METHODDEF
13926 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13927 UNICODE_PARTITION_METHODDEF
13928 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13929 UNICODE_LJUST_METHODDEF
13930 UNICODE_LOWER_METHODDEF
13931 UNICODE_LSTRIP_METHODDEF
13932 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13933 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13934 UNICODE_RJUST_METHODDEF
13935 UNICODE_RSTRIP_METHODDEF
13936 UNICODE_RPARTITION_METHODDEF
13937 UNICODE_SPLITLINES_METHODDEF
13938 UNICODE_STRIP_METHODDEF
13939 UNICODE_SWAPCASE_METHODDEF
13940 UNICODE_TRANSLATE_METHODDEF
13941 UNICODE_UPPER_METHODDEF
13942 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13943 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13944 UNICODE_ISLOWER_METHODDEF
13945 UNICODE_ISUPPER_METHODDEF
13946 UNICODE_ISTITLE_METHODDEF
13947 UNICODE_ISSPACE_METHODDEF
13948 UNICODE_ISDECIMAL_METHODDEF
13949 UNICODE_ISDIGIT_METHODDEF
13950 UNICODE_ISNUMERIC_METHODDEF
13951 UNICODE_ISALPHA_METHODDEF
13952 UNICODE_ISALNUM_METHODDEF
13953 UNICODE_ISIDENTIFIER_METHODDEF
13954 UNICODE_ISPRINTABLE_METHODDEF
13955 UNICODE_ZFILL_METHODDEF
13956 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13957 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13958 UNICODE___FORMAT___METHODDEF
13959 UNICODE_MAKETRANS_METHODDEF
13960 UNICODE_SIZEOF_METHODDEF
13961#if 0
13962 /* These methods are just used for debugging the implementation. */
13963 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13964#endif
13965
13966 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
13967 {NULL, NULL}
13968};
13969
13970static PyObject *
13971unicode_mod(PyObject *v, PyObject *w)
13972{
13973 if (!PyUnicode_Check(v))
13974 Py_RETURN_NOTIMPLEMENTED;
13975 return PyUnicode_Format(v, w);
13976}
13977
13978static PyNumberMethods unicode_as_number = {
13979 0, /*nb_add*/
13980 0, /*nb_subtract*/
13981 0, /*nb_multiply*/
13982 unicode_mod, /*nb_remainder*/
13983};
13984
13985static PySequenceMethods unicode_as_sequence = {
13986 (lenfunc) unicode_length, /* sq_length */
13987 PyUnicode_Concat, /* sq_concat */
13988 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13989 (ssizeargfunc) unicode_getitem, /* sq_item */
13990 0, /* sq_slice */
13991 0, /* sq_ass_item */
13992 0, /* sq_ass_slice */
13993 PyUnicode_Contains, /* sq_contains */
13994};
13995
13996static PyObject*
13997unicode_subscript(PyObject* self, PyObject* item)
13998{
13999 if (PyUnicode_READY(self) == -1)
14000 return NULL;
14001
14002 if (PyIndex_Check(item)) {
14003 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14004 if (i == -1 && PyErr_Occurred())
14005 return NULL;
14006 if (i < 0)
14007 i += PyUnicode_GET_LENGTH(self);
14008 return unicode_getitem(self, i);
14009 } else if (PySlice_Check(item)) {
14010 Py_ssize_t start, stop, step, slicelength, cur, i;
14011 PyObject *result;
14012 void *src_data, *dest_data;
14013 int src_kind, dest_kind;
14014 Py_UCS4 ch, max_char, kind_limit;
14015
14016 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14017 return NULL;
14018 }
14019 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14020 &start, &stop, step);
14021
14022 if (slicelength <= 0) {
14023 _Py_RETURN_UNICODE_EMPTY();
14024 } else if (start == 0 && step == 1 &&
14025 slicelength == PyUnicode_GET_LENGTH(self)) {
14026 return unicode_result_unchanged(self);
14027 } else if (step == 1) {
14028 return PyUnicode_Substring(self,
14029 start, start + slicelength);
14030 }
14031 /* General case */
14032 src_kind = PyUnicode_KIND(self);
14033 src_data = PyUnicode_DATA(self);
14034 if (!PyUnicode_IS_ASCII(self)) {
14035 kind_limit = kind_maxchar_limit(src_kind);
14036 max_char = 0;
14037 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14038 ch = PyUnicode_READ(src_kind, src_data, cur);
14039 if (ch > max_char) {
14040 max_char = ch;
14041 if (max_char >= kind_limit)
14042 break;
14043 }
14044 }
14045 }
14046 else
14047 max_char = 127;
14048 result = PyUnicode_New(slicelength, max_char);
14049 if (result == NULL)
14050 return NULL;
14051 dest_kind = PyUnicode_KIND(result);
14052 dest_data = PyUnicode_DATA(result);
14053
14054 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14055 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14056 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14057 }
14058 assert(_PyUnicode_CheckConsistency(result, 1));
14059 return result;
14060 } else {
14061 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14062 return NULL;
14063 }
14064}
14065
14066static PyMappingMethods unicode_as_mapping = {
14067 (lenfunc)unicode_length, /* mp_length */
14068 (binaryfunc)unicode_subscript, /* mp_subscript */
14069 (objobjargproc)0, /* mp_ass_subscript */
14070};
14071
14072
14073/* Helpers for PyUnicode_Format() */
14074
14075struct unicode_formatter_t {
14076 PyObject *args;
14077 int args_owned;
14078 Py_ssize_t arglen, argidx;
14079 PyObject *dict;
14080
14081 enum PyUnicode_Kind fmtkind;
14082 Py_ssize_t fmtcnt, fmtpos;
14083 void *fmtdata;
14084 PyObject *fmtstr;
14085
14086 _PyUnicodeWriter writer;
14087};
14088
14089struct unicode_format_arg_t {
14090 Py_UCS4 ch;
14091 int flags;
14092 Py_ssize_t width;
14093 int prec;
14094 int sign;
14095};
14096
14097static PyObject *
14098unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14099{
14100 Py_ssize_t argidx = ctx->argidx;
14101
14102 if (argidx < ctx->arglen) {
14103 ctx->argidx++;
14104 if (ctx->arglen < 0)
14105 return ctx->args;
14106 else
14107 return PyTuple_GetItem(ctx->args, argidx);
14108 }
14109 PyErr_SetString(PyExc_TypeError,
14110 "not enough arguments for format string");
14111 return NULL;
14112}
14113
14114/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14115
14116/* Format a float into the writer if the writer is not NULL, or into *p_output
14117 otherwise.
14118
14119 Return 0 on success, raise an exception and return -1 on error. */
14120static int
14121formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14122 PyObject **p_output,
14123 _PyUnicodeWriter *writer)
14124{
14125 char *p;
14126 double x;
14127 Py_ssize_t len;
14128 int prec;
14129 int dtoa_flags;
14130
14131 x = PyFloat_AsDouble(v);
14132 if (x == -1.0 && PyErr_Occurred())
14133 return -1;
14134
14135 prec = arg->prec;
14136 if (prec < 0)
14137 prec = 6;
14138
14139 if (arg->flags & F_ALT)
14140 dtoa_flags = Py_DTSF_ALT;
14141 else
14142 dtoa_flags = 0;
14143 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14144 if (p == NULL)
14145 return -1;
14146 len = strlen(p);
14147 if (writer) {
14148 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14149 PyMem_Free(p);
14150 return -1;
14151 }
14152 }
14153 else
14154 *p_output = _PyUnicode_FromASCII(p, len);
14155 PyMem_Free(p);
14156 return 0;
14157}
14158
14159/* formatlong() emulates the format codes d, u, o, x and X, and
14160 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14161 * Python's regular ints.
14162 * Return value: a new PyUnicodeObject*, or NULL if error.
14163 * The output string is of the form
14164 * "-"? ("0x" | "0X")? digit+
14165 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14166 * set in flags. The case of hex digits will be correct,
14167 * There will be at least prec digits, zero-filled on the left if
14168 * necessary to get that many.
14169 * val object to be converted
14170 * flags bitmask of format flags; only F_ALT is looked at
14171 * prec minimum number of digits; 0-fill on left if needed
14172 * type a character in [duoxX]; u acts the same as d
14173 *
14174 * CAUTION: o, x and X conversions on regular ints can never
14175 * produce a '-' sign, but can for Python's unbounded ints.
14176 */
14177PyObject *
14178_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14179{
14180 PyObject *result = NULL;
14181 char *buf;
14182 Py_ssize_t i;
14183 int sign; /* 1 if '-', else 0 */
14184 int len; /* number of characters */
14185 Py_ssize_t llen;
14186 int numdigits; /* len == numnondigits + numdigits */
14187 int numnondigits = 0;
14188
14189 /* Avoid exceeding SSIZE_T_MAX */
14190 if (prec > INT_MAX-3) {
14191 PyErr_SetString(PyExc_OverflowError,
14192 "precision too large");
14193 return NULL;
14194 }
14195
14196 assert(PyLong_Check(val));
14197
14198 switch (type) {
14199 default:
14200 assert(!"'type' not in [diuoxX]");
14201 case 'd':
14202 case 'i':
14203 case 'u':
14204 /* int and int subclasses should print numerically when a numeric */
14205 /* format code is used (see issue18780) */
14206 result = PyNumber_ToBase(val, 10);
14207 break;
14208 case 'o':
14209 numnondigits = 2;
14210 result = PyNumber_ToBase(val, 8);
14211 break;
14212 case 'x':
14213 case 'X':
14214 numnondigits = 2;
14215 result = PyNumber_ToBase(val, 16);
14216 break;
14217 }
14218 if (!result)
14219 return NULL;
14220
14221 assert(unicode_modifiable(result));
14222 assert(PyUnicode_IS_READY(result));
14223 assert(PyUnicode_IS_ASCII(result));
14224
14225 /* To modify the string in-place, there can only be one reference. */
14226 if (Py_REFCNT(result) != 1) {
14227 Py_DECREF(result);
14228 PyErr_BadInternalCall();
14229 return NULL;
14230 }
14231 buf = PyUnicode_DATA(result);
14232 llen = PyUnicode_GET_LENGTH(result);
14233 if (llen > INT_MAX) {
14234 Py_DECREF(result);
14235 PyErr_SetString(PyExc_ValueError,
14236 "string too large in _PyUnicode_FormatLong");
14237 return NULL;
14238 }
14239 len = (int)llen;
14240 sign = buf[0] == '-';
14241 numnondigits += sign;
14242 numdigits = len - numnondigits;
14243 assert(numdigits > 0);
14244
14245 /* Get rid of base marker unless F_ALT */
14246 if (((alt) == 0 &&
14247 (type == 'o' || type == 'x' || type == 'X'))) {
14248 assert(buf[sign] == '0');
14249 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14250 buf[sign+1] == 'o');
14251 numnondigits -= 2;
14252 buf += 2;
14253 len -= 2;
14254 if (sign)
14255 buf[0] = '-';
14256 assert(len == numnondigits + numdigits);
14257 assert(numdigits > 0);
14258 }
14259
14260 /* Fill with leading zeroes to meet minimum width. */
14261 if (prec > numdigits) {
14262 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14263 numnondigits + prec);
14264 char *b1;
14265 if (!r1) {
14266 Py_DECREF(result);
14267 return NULL;
14268 }
14269 b1 = PyBytes_AS_STRING(r1);
14270 for (i = 0; i < numnondigits; ++i)
14271 *b1++ = *buf++;
14272 for (i = 0; i < prec - numdigits; i++)
14273 *b1++ = '0';
14274 for (i = 0; i < numdigits; i++)
14275 *b1++ = *buf++;
14276 *b1 = '\0';
14277 Py_DECREF(result);
14278 result = r1;
14279 buf = PyBytes_AS_STRING(result);
14280 len = numnondigits + prec;
14281 }
14282
14283 /* Fix up case for hex conversions. */
14284 if (type == 'X') {
14285 /* Need to convert all lower case letters to upper case.
14286 and need to convert 0x to 0X (and -0x to -0X). */
14287 for (i = 0; i < len; i++)
14288 if (buf[i] >= 'a' && buf[i] <= 'x')
14289 buf[i] -= 'a'-'A';
14290 }
14291 if (!PyUnicode_Check(result)
14292 || buf != PyUnicode_DATA(result)) {
14293 PyObject *unicode;
14294 unicode = _PyUnicode_FromASCII(buf, len);
14295 Py_DECREF(result);
14296 result = unicode;
14297 }
14298 else if (len != PyUnicode_GET_LENGTH(result)) {
14299 if (PyUnicode_Resize(&result, len) < 0)
14300 Py_CLEAR(result);
14301 }
14302 return result;
14303}
14304
14305/* Format an integer or a float as an integer.
14306 * Return 1 if the number has been formatted into the writer,
14307 * 0 if the number has been formatted into *p_output
14308 * -1 and raise an exception on error */
14309static int
14310mainformatlong(PyObject *v,
14311 struct unicode_format_arg_t *arg,
14312 PyObject **p_output,
14313 _PyUnicodeWriter *writer)
14314{
14315 PyObject *iobj, *res;
14316 char type = (char)arg->ch;
14317
14318 if (!PyNumber_Check(v))
14319 goto wrongtype;
14320
14321 /* make sure number is a type of integer for o, x, and X */
14322 if (!PyLong_Check(v)) {
14323 if (type == 'o' || type == 'x' || type == 'X') {
14324 iobj = PyNumber_Index(v);
14325 if (iobj == NULL) {
14326 if (PyErr_ExceptionMatches(PyExc_TypeError))
14327 goto wrongtype;
14328 return -1;
14329 }
14330 }
14331 else {
14332 iobj = PyNumber_Long(v);
14333 if (iobj == NULL ) {
14334 if (PyErr_ExceptionMatches(PyExc_TypeError))
14335 goto wrongtype;
14336 return -1;
14337 }
14338 }
14339 assert(PyLong_Check(iobj));
14340 }
14341 else {
14342 iobj = v;
14343 Py_INCREF(iobj);
14344 }
14345
14346 if (PyLong_CheckExact(v)
14347 && arg->width == -1 && arg->prec == -1
14348 && !(arg->flags & (F_SIGN | F_BLANK))
14349 && type != 'X')
14350 {
14351 /* Fast path */
14352 int alternate = arg->flags & F_ALT;
14353 int base;
14354
14355 switch(type)
14356 {
14357 default:
14358 assert(0 && "'type' not in [diuoxX]");
14359 case 'd':
14360 case 'i':
14361 case 'u':
14362 base = 10;
14363 break;
14364 case 'o':
14365 base = 8;
14366 break;
14367 case 'x':
14368 case 'X':
14369 base = 16;
14370 break;
14371 }
14372
14373 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14374 Py_DECREF(iobj);
14375 return -1;
14376 }
14377 Py_DECREF(iobj);
14378 return 1;
14379 }
14380
14381 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14382 Py_DECREF(iobj);
14383 if (res == NULL)
14384 return -1;
14385 *p_output = res;
14386 return 0;
14387
14388wrongtype:
14389 switch(type)
14390 {
14391 case 'o':
14392 case 'x':
14393 case 'X':
14394 PyErr_Format(PyExc_TypeError,
14395 "%%%c format: an integer is required, "
14396 "not %.200s",
14397 type, Py_TYPE(v)->tp_name);
14398 break;
14399 default:
14400 PyErr_Format(PyExc_TypeError,
14401 "%%%c format: a number is required, "
14402 "not %.200s",
14403 type, Py_TYPE(v)->tp_name);
14404 break;
14405 }
14406 return -1;
14407}
14408
14409static Py_UCS4
14410formatchar(PyObject *v)
14411{
14412 /* presume that the buffer is at least 3 characters long */
14413 if (PyUnicode_Check(v)) {
14414 if (PyUnicode_GET_LENGTH(v) == 1) {
14415 return PyUnicode_READ_CHAR(v, 0);
14416 }
14417 goto onError;
14418 }
14419 else {
14420 PyObject *iobj;
14421 long x;
14422 /* make sure number is a type of integer */
14423 if (!PyLong_Check(v)) {
14424 iobj = PyNumber_Index(v);
14425 if (iobj == NULL) {
14426 goto onError;
14427 }
14428 x = PyLong_AsLong(iobj);
14429 Py_DECREF(iobj);
14430 }
14431 else {
14432 x = PyLong_AsLong(v);
14433 }
14434 if (x == -1 && PyErr_Occurred())
14435 goto onError;
14436
14437 if (x < 0 || x > MAX_UNICODE) {
14438 PyErr_SetString(PyExc_OverflowError,
14439 "%c arg not in range(0x110000)");
14440 return (Py_UCS4) -1;
14441 }
14442
14443 return (Py_UCS4) x;
14444 }
14445
14446 onError:
14447 PyErr_SetString(PyExc_TypeError,
14448 "%c requires int or char");
14449 return (Py_UCS4) -1;
14450}
14451
14452/* Parse options of an argument: flags, width, precision.
14453 Handle also "%(name)" syntax.
14454
14455 Return 0 if the argument has been formatted into arg->str.
14456 Return 1 if the argument has been written into ctx->writer,
14457 Raise an exception and return -1 on error. */
14458static int
14459unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14460 struct unicode_format_arg_t *arg)
14461{
14462#define FORMAT_READ(ctx) \
14463 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14464
14465 PyObject *v;
14466
14467 if (arg->ch == '(') {
14468 /* Get argument value from a dictionary. Example: "%(name)s". */
14469 Py_ssize_t keystart;
14470 Py_ssize_t keylen;
14471 PyObject *key;
14472 int pcount = 1;
14473
14474 if (ctx->dict == NULL) {
14475 PyErr_SetString(PyExc_TypeError,
14476 "format requires a mapping");
14477 return -1;
14478 }
14479 ++ctx->fmtpos;
14480 --ctx->fmtcnt;
14481 keystart = ctx->fmtpos;
14482 /* Skip over balanced parentheses */
14483 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14484 arg->ch = FORMAT_READ(ctx);
14485 if (arg->ch == ')')
14486 --pcount;
14487 else if (arg->ch == '(')
14488 ++pcount;
14489 ctx->fmtpos++;
14490 }
14491 keylen = ctx->fmtpos - keystart - 1;
14492 if (ctx->fmtcnt < 0 || pcount > 0) {
14493 PyErr_SetString(PyExc_ValueError,
14494 "incomplete format key");
14495 return -1;
14496 }
14497 key = PyUnicode_Substring(ctx->fmtstr,
14498 keystart, keystart + keylen);
14499 if (key == NULL)
14500 return -1;
14501 if (ctx->args_owned) {
14502 ctx->args_owned = 0;
14503 Py_DECREF(ctx->args);
14504 }
14505 ctx->args = PyObject_GetItem(ctx->dict, key);
14506 Py_DECREF(key);
14507 if (ctx->args == NULL)
14508 return -1;
14509 ctx->args_owned = 1;
14510 ctx->arglen = -1;
14511 ctx->argidx = -2;
14512 }
14513
14514 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14515 while (--ctx->fmtcnt >= 0) {
14516 arg->ch = FORMAT_READ(ctx);
14517 ctx->fmtpos++;
14518 switch (arg->ch) {
14519 case '-': arg->flags |= F_LJUST; continue;
14520 case '+': arg->flags |= F_SIGN; continue;
14521 case ' ': arg->flags |= F_BLANK; continue;
14522 case '#': arg->flags |= F_ALT; continue;
14523 case '0': arg->flags |= F_ZERO; continue;
14524 }
14525 break;
14526 }
14527
14528 /* Parse width. Example: "%10s" => width=10 */
14529 if (arg->ch == '*') {
14530 v = unicode_format_getnextarg(ctx);
14531 if (v == NULL)
14532 return -1;
14533 if (!PyLong_Check(v)) {
14534 PyErr_SetString(PyExc_TypeError,
14535 "* wants int");
14536 return -1;
14537 }
14538 arg->width = PyLong_AsSsize_t(v);
14539 if (arg->width == -1 && PyErr_Occurred())
14540 return -1;
14541 if (arg->width < 0) {
14542 arg->flags |= F_LJUST;
14543 arg->width = -arg->width;
14544 }
14545 if (--ctx->fmtcnt >= 0) {
14546 arg->ch = FORMAT_READ(ctx);
14547 ctx->fmtpos++;
14548 }
14549 }
14550 else if (arg->ch >= '0' && arg->ch <= '9') {
14551 arg->width = arg->ch - '0';
14552 while (--ctx->fmtcnt >= 0) {
14553 arg->ch = FORMAT_READ(ctx);
14554 ctx->fmtpos++;
14555 if (arg->ch < '0' || arg->ch > '9')
14556 break;
14557 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14558 mixing signed and unsigned comparison. Since arg->ch is between
14559 '0' and '9', casting to int is safe. */
14560 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14561 PyErr_SetString(PyExc_ValueError,
14562 "width too big");
14563 return -1;
14564 }
14565 arg->width = arg->width*10 + (arg->ch - '0');
14566 }
14567 }
14568
14569 /* Parse precision. Example: "%.3f" => prec=3 */
14570 if (arg->ch == '.') {
14571 arg->prec = 0;
14572 if (--ctx->fmtcnt >= 0) {
14573 arg->ch = FORMAT_READ(ctx);
14574 ctx->fmtpos++;
14575 }
14576 if (arg->ch == '*') {
14577 v = unicode_format_getnextarg(ctx);
14578 if (v == NULL)
14579 return -1;
14580 if (!PyLong_Check(v)) {
14581 PyErr_SetString(PyExc_TypeError,
14582 "* wants int");
14583 return -1;
14584 }
14585 arg->prec = _PyLong_AsInt(v);
14586 if (arg->prec == -1 && PyErr_Occurred())
14587 return -1;
14588 if (arg->prec < 0)
14589 arg->prec = 0;
14590 if (--ctx->fmtcnt >= 0) {
14591 arg->ch = FORMAT_READ(ctx);
14592 ctx->fmtpos++;
14593 }
14594 }
14595 else if (arg->ch >= '0' && arg->ch <= '9') {
14596 arg->prec = arg->ch - '0';
14597 while (--ctx->fmtcnt >= 0) {
14598 arg->ch = FORMAT_READ(ctx);
14599 ctx->fmtpos++;
14600 if (arg->ch < '0' || arg->ch > '9')
14601 break;
14602 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14603 PyErr_SetString(PyExc_ValueError,
14604 "precision too big");
14605 return -1;
14606 }
14607 arg->prec = arg->prec*10 + (arg->ch - '0');
14608 }
14609 }
14610 }
14611
14612 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14613 if (ctx->fmtcnt >= 0) {
14614 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14615 if (--ctx->fmtcnt >= 0) {
14616 arg->ch = FORMAT_READ(ctx);
14617 ctx->fmtpos++;
14618 }
14619 }
14620 }
14621 if (ctx->fmtcnt < 0) {
14622 PyErr_SetString(PyExc_ValueError,
14623 "incomplete format");
14624 return -1;
14625 }
14626 return 0;
14627
14628#undef FORMAT_READ
14629}
14630
14631/* Format one argument. Supported conversion specifiers:
14632
14633 - "s", "r", "a": any type
14634 - "i", "d", "u": int or float
14635 - "o", "x", "X": int
14636 - "e", "E", "f", "F", "g", "G": float
14637 - "c": int or str (1 character)
14638
14639 When possible, the output is written directly into the Unicode writer
14640 (ctx->writer). A string is created when padding is required.
14641
14642 Return 0 if the argument has been formatted into *p_str,
14643 1 if the argument has been written into ctx->writer,
14644 -1 on error. */
14645static int
14646unicode_format_arg_format(struct unicode_formatter_t *ctx,
14647 struct unicode_format_arg_t *arg,
14648 PyObject **p_str)
14649{
14650 PyObject *v;
14651 _PyUnicodeWriter *writer = &ctx->writer;
14652
14653 if (ctx->fmtcnt == 0)
14654 ctx->writer.overallocate = 0;
14655
14656 v = unicode_format_getnextarg(ctx);
14657 if (v == NULL)
14658 return -1;
14659
14660
14661 switch (arg->ch) {
14662 case 's':
14663 case 'r':
14664 case 'a':
14665 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14666 /* Fast path */
14667 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14668 return -1;
14669 return 1;
14670 }
14671
14672 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14673 *p_str = v;
14674 Py_INCREF(*p_str);
14675 }
14676 else {
14677 if (arg->ch == 's')
14678 *p_str = PyObject_Str(v);
14679 else if (arg->ch == 'r')
14680 *p_str = PyObject_Repr(v);
14681 else
14682 *p_str = PyObject_ASCII(v);
14683 }
14684 break;
14685
14686 case 'i':
14687 case 'd':
14688 case 'u':
14689 case 'o':
14690 case 'x':
14691 case 'X':
14692 {
14693 int ret = mainformatlong(v, arg, p_str, writer);
14694 if (ret != 0)
14695 return ret;
14696 arg->sign = 1;
14697 break;
14698 }
14699
14700 case 'e':
14701 case 'E':
14702 case 'f':
14703 case 'F':
14704 case 'g':
14705 case 'G':
14706 if (arg->width == -1 && arg->prec == -1
14707 && !(arg->flags & (F_SIGN | F_BLANK)))
14708 {
14709 /* Fast path */
14710 if (formatfloat(v, arg, NULL, writer) == -1)
14711 return -1;
14712 return 1;
14713 }
14714
14715 arg->sign = 1;
14716 if (formatfloat(v, arg, p_str, NULL) == -1)
14717 return -1;
14718 break;
14719
14720 case 'c':
14721 {
14722 Py_UCS4 ch = formatchar(v);
14723 if (ch == (Py_UCS4) -1)
14724 return -1;
14725 if (arg->width == -1 && arg->prec == -1) {
14726 /* Fast path */
14727 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14728 return -1;
14729 return 1;
14730 }
14731 *p_str = PyUnicode_FromOrdinal(ch);
14732 break;
14733 }
14734
14735 default:
14736 PyErr_Format(PyExc_ValueError,
14737 "unsupported format character '%c' (0x%x) "
14738 "at index %zd",
14739 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14740 (int)arg->ch,
14741 ctx->fmtpos - 1);
14742 return -1;
14743 }
14744 if (*p_str == NULL)
14745 return -1;
14746 assert (PyUnicode_Check(*p_str));
14747 return 0;
14748}
14749
14750static int
14751unicode_format_arg_output(struct unicode_formatter_t *ctx,
14752 struct unicode_format_arg_t *arg,
14753 PyObject *str)
14754{
14755 Py_ssize_t len;
14756 enum PyUnicode_Kind kind;
14757 void *pbuf;
14758 Py_ssize_t pindex;
14759 Py_UCS4 signchar;
14760 Py_ssize_t buflen;
14761 Py_UCS4 maxchar;
14762 Py_ssize_t sublen;
14763 _PyUnicodeWriter *writer = &ctx->writer;
14764 Py_UCS4 fill;
14765
14766 fill = ' ';
14767 if (arg->sign && arg->flags & F_ZERO)
14768 fill = '0';
14769
14770 if (PyUnicode_READY(str) == -1)
14771 return -1;
14772
14773 len = PyUnicode_GET_LENGTH(str);
14774 if ((arg->width == -1 || arg->width <= len)
14775 && (arg->prec == -1 || arg->prec >= len)
14776 && !(arg->flags & (F_SIGN | F_BLANK)))
14777 {
14778 /* Fast path */
14779 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14780 return -1;
14781 return 0;
14782 }
14783
14784 /* Truncate the string for "s", "r" and "a" formats
14785 if the precision is set */
14786 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14787 if (arg->prec >= 0 && len > arg->prec)
14788 len = arg->prec;
14789 }
14790
14791 /* Adjust sign and width */
14792 kind = PyUnicode_KIND(str);
14793 pbuf = PyUnicode_DATA(str);
14794 pindex = 0;
14795 signchar = '\0';
14796 if (arg->sign) {
14797 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14798 if (ch == '-' || ch == '+') {
14799 signchar = ch;
14800 len--;
14801 pindex++;
14802 }
14803 else if (arg->flags & F_SIGN)
14804 signchar = '+';
14805 else if (arg->flags & F_BLANK)
14806 signchar = ' ';
14807 else
14808 arg->sign = 0;
14809 }
14810 if (arg->width < len)
14811 arg->width = len;
14812
14813 /* Prepare the writer */
14814 maxchar = writer->maxchar;
14815 if (!(arg->flags & F_LJUST)) {
14816 if (arg->sign) {
14817 if ((arg->width-1) > len)
14818 maxchar = Py_MAX(maxchar, fill);
14819 }
14820 else {
14821 if (arg->width > len)
14822 maxchar = Py_MAX(maxchar, fill);
14823 }
14824 }
14825 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14826 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14827 maxchar = Py_MAX(maxchar, strmaxchar);
14828 }
14829
14830 buflen = arg->width;
14831 if (arg->sign && len == arg->width)
14832 buflen++;
14833 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14834 return -1;
14835
14836 /* Write the sign if needed */
14837 if (arg->sign) {
14838 if (fill != ' ') {
14839 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14840 writer->pos += 1;
14841 }
14842 if (arg->width > len)
14843 arg->width--;
14844 }
14845
14846 /* Write the numeric prefix for "x", "X" and "o" formats
14847 if the alternate form is used.
14848 For example, write "0x" for the "%#x" format. */
14849 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14850 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14851 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14852 if (fill != ' ') {
14853 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14854 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14855 writer->pos += 2;
14856 pindex += 2;
14857 }
14858 arg->width -= 2;
14859 if (arg->width < 0)
14860 arg->width = 0;
14861 len -= 2;
14862 }
14863
14864 /* Pad left with the fill character if needed */
14865 if (arg->width > len && !(arg->flags & F_LJUST)) {
14866 sublen = arg->width - len;
14867 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14868 writer->pos += sublen;
14869 arg->width = len;
14870 }
14871
14872 /* If padding with spaces: write sign if needed and/or numeric prefix if
14873 the alternate form is used */
14874 if (fill == ' ') {
14875 if (arg->sign) {
14876 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14877 writer->pos += 1;
14878 }
14879 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14880 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14881 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14882 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14883 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14884 writer->pos += 2;
14885 pindex += 2;
14886 }
14887 }
14888
14889 /* Write characters */
14890 if (len) {
14891 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14892 str, pindex, len);
14893 writer->pos += len;
14894 }
14895
14896 /* Pad right with the fill character if needed */
14897 if (arg->width > len) {
14898 sublen = arg->width - len;
14899 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14900 writer->pos += sublen;
14901 }
14902 return 0;
14903}
14904
14905/* Helper of PyUnicode_Format(): format one arg.
14906 Return 0 on success, raise an exception and return -1 on error. */
14907static int
14908unicode_format_arg(struct unicode_formatter_t *ctx)
14909{
14910 struct unicode_format_arg_t arg;
14911 PyObject *str;
14912 int ret;
14913
14914 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14915 if (arg.ch == '%') {
14916 ctx->fmtpos++;
14917 ctx->fmtcnt--;
14918 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14919 return -1;
14920 return 0;
14921 }
14922 arg.flags = 0;
14923 arg.width = -1;
14924 arg.prec = -1;
14925 arg.sign = 0;
14926 str = NULL;
14927
14928 ret = unicode_format_arg_parse(ctx, &arg);
14929 if (ret == -1)
14930 return -1;
14931
14932 ret = unicode_format_arg_format(ctx, &arg, &str);
14933 if (ret == -1)
14934 return -1;
14935
14936 if (ret != 1) {
14937 ret = unicode_format_arg_output(ctx, &arg, str);
14938 Py_DECREF(str);
14939 if (ret == -1)
14940 return -1;
14941 }
14942
14943 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14944 PyErr_SetString(PyExc_TypeError,
14945 "not all arguments converted during string formatting");
14946 return -1;
14947 }
14948 return 0;
14949}
14950
/* Build a new string by applying printf-style ('%') formatting of `format`
   to `args`.

   format: the format string; rejected if ensure_unicode() fails on it.
   args:   a tuple is consumed as positional arguments.  Anything else is
           treated as a single argument and, when it passes
           PyMapping_Check() and is neither a tuple nor a str, it is also
           kept as the mapping in ctx.dict.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL with an
   exception set: a bad internal call when either argument is NULL, a
   TypeError when positional arguments are left over, or whatever the
   helper routines reported. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Give up if ensure_unicode() rejects the format object. */
    if (ensure_unicode(format) < 0)
        return NULL;

    /* fmtpos is the read position; fmtcnt counts the characters left. */
    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    /* Size hint for the output: the format length plus 100 characters of
       headroom for the substituted values. */
    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    if (PyTuple_Check(args)) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Single non-tuple argument.  With these sentinels
           argidx < arglen holds until something changes argidx, so the
           "not all arguments converted" check below fires if the value is
           never used.  NOTE(review): the code that advances argidx lives in
           the argument helpers, which are not visible here. */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
        ctx.dict = args;
    else
        ctx.dict = NULL;
    ctx.args = args;

    /* fmtcnt is decremented before each character is examined, so inside
       the loop it is the number of characters after the current one. */
    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            /* Literal text: find the whole run up to the next '%' (or the
               end of the format) and copy it in one write. */
            Py_ssize_t nonfmtpos;

            nonfmtpos = ctx.fmtpos++;
            while (ctx.fmtcnt >= 0 &&
                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
                ctx.fmtpos++;
                ctx.fmtcnt--;
            }
            if (ctx.fmtcnt < 0) {
                /* The scan ran off the end of the format: it advanced one
                   position too far, so step back.  This is the final write,
                   so overallocation is switched off. */
                ctx.fmtpos--;
                ctx.writer.overallocate = 0;
            }

            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                                nonfmtpos, ctx.fmtpos) < 0)
                goto onError;
        }
        else {
            /* Skip the '%' and let the helper handle the conversion
               specifier that follows it. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1)
                goto onError;
        }
    }

    /* Left-over arguments are an error unless a mapping was supplied. */
    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* Release ctx.args when the context owns a reference to it.
       NOTE(review): args_owned is only cleared in this function; whatever
       sets it is in the helpers. */
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15033
15034static PyObject *
15035unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15036
15037static PyObject *
15038unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15039{
15040 PyObject *x = NULL;
15041 static char *kwlist[] = {"object", "encoding", "errors", 0};
15042 char *encoding = NULL;
15043 char *errors = NULL;
15044
15045 if (type != &PyUnicode_Type)
15046 return unicode_subtype_new(type, args, kwds);
15047 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15048 kwlist, &x, &encoding, &errors))
15049 return NULL;
15050 if (x == NULL)
15051 _Py_RETURN_UNICODE_EMPTY();
15052 if (encoding == NULL && errors == NULL)
15053 return PyObject_Str(x);
15054 else
15055 return PyUnicode_FromEncodedObject(x, encoding, errors);
15056}
15057
/* Create an instance of `type`, which must be a subtype of str.

   The value is first built as an exact str by calling unicode_new() with
   PyUnicode_Type; the subtype instance is then allocated with
   type->tp_alloc() and filled in by hand with its own, separately
   allocated character buffer (state.compact is 0).

   Returns the new instance, or NULL with an exception set. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *unicode, *self;
    Py_ssize_t length, char_size;
    int share_wstr, share_utf8;
    unsigned int kind;
    void *data;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));

    /* Let the exact-str constructor do the argument handling. */
    unicode = unicode_new(&PyUnicode_Type, args, kwds);
    if (unicode == NULL)
        return NULL;
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return NULL;
    }

    self = type->tp_alloc(type, 0);
    if (self == NULL) {
        Py_DECREF(unicode);
        return NULL;
    }
    kind = PyUnicode_KIND(unicode);
    length = PyUnicode_GET_LENGTH(unicode);

    /* Mirror the source string's header; every buffer pointer starts out
       NULL so the error path can release `self` before data exists. */
    _PyUnicode_LENGTH(self) = length;
#ifdef Py_DEBUG
    /* Debug builds leave the hash unset (-1) here and copy the real value
       only after the consistency assertion further down. */
    _PyUnicode_HASH(self) = -1;
#else
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    _PyUnicode_STATE(self).interned = 0;
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
    _PyUnicode_STATE(self).ready = 1;
    _PyUnicode_WSTR(self) = NULL;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_WSTR_LENGTH(self) = 0;
    _PyUnicode_DATA_ANY(self) = NULL;

    /* Pick the per-character width and decide whether the character buffer
       can also serve as the UTF-8 form (1-byte kind with every character
       below 128) or as the wchar_t form (character width equal to
       sizeof(wchar_t)). */
    share_utf8 = 0;
    share_wstr = 0;
    if (kind == PyUnicode_1BYTE_KIND) {
        char_size = 1;
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
            share_utf8 = 1;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            share_wstr = 1;
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            share_wstr = 1;
    }

    /* Ensure we won't overflow the length. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        goto onError;
    }
    /* One extra character of room beyond `length`. */
    data = PyObject_MALLOC((length + 1) * char_size);
    if (data == NULL) {
        PyErr_NoMemory();
        goto onError;
    }

    _PyUnicode_DATA_ANY(self) = data;
    if (share_utf8) {
        _PyUnicode_UTF8_LENGTH(self) = length;
        _PyUnicode_UTF8(self) = data;
    }
    if (share_wstr) {
        _PyUnicode_WSTR_LENGTH(self) = length;
        _PyUnicode_WSTR(self) = (wchar_t *)data;
    }

    /* Copy length + 1 characters.  The byte count is computed from `kind`
       rather than char_size.
       NOTE(review): this assumes the kind constants equal the character
       width in bytes (1, 2, 4) - confirm against their definitions. */
    memcpy(data, PyUnicode_DATA(unicode),
           kind * (length + 1));
    assert(_PyUnicode_CheckConsistency(self, 1));
#ifdef Py_DEBUG
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    Py_DECREF(unicode);
    return self;

onError:
    Py_DECREF(unicode);
    Py_DECREF(self);
    return NULL;
}
15157
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15169
15170static PyObject *unicode_iter(PyObject *seq);
15171
/* Type object for the built-in "str" type.  Instances are created through
   unicode_new and iterated through unicode_iter; the type may be subclassed
   (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15215
15216/* Initialize the Unicode implementation */
15217
15218int _PyUnicode_Init(void)
15219{
15220 /* XXX - move this array to unicodectype.c ? */
15221 Py_UCS2 linebreak[] = {
15222 0x000A, /* LINE FEED */
15223 0x000D, /* CARRIAGE RETURN */
15224 0x001C, /* FILE SEPARATOR */
15225 0x001D, /* GROUP SEPARATOR */
15226 0x001E, /* RECORD SEPARATOR */
15227 0x0085, /* NEXT LINE */
15228 0x2028, /* LINE SEPARATOR */
15229 0x2029, /* PARAGRAPH SEPARATOR */
15230 };
15231
15232 /* Init the implementation */
15233 _Py_INCREF_UNICODE_EMPTY();
15234 if (!unicode_empty)
15235 Py_FatalError("Can't create empty string");
15236 Py_DECREF(unicode_empty);
15237
15238 if (PyType_Ready(&PyUnicode_Type) < 0)
15239 Py_FatalError("Can't initialize 'unicode'");
15240
15241 /* initialize the linebreak bloom filter */
15242 bloom_linebreak = make_bloom_mask(
15243 PyUnicode_2BYTE_KIND, linebreak,
15244 Py_ARRAY_LENGTH(linebreak));
15245
15246 if (PyType_Ready(&EncodingMapType) < 0)
15247 Py_FatalError("Can't initialize encoding map type");
15248
15249 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15250 Py_FatalError("Can't initialize field name iterator type");
15251
15252 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15253 Py_FatalError("Can't initialize formatter iter type");
15254
15255 return 0;
15256}
15257
15258/* Finalize the Unicode implementation */
15259
/* Does no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15265
15266void
15267_PyUnicode_Fini(void)
15268{
15269 int i;
15270
15271 Py_CLEAR(unicode_empty);
15272
15273 for (i = 0; i < 256; i++)
15274 Py_CLEAR(unicode_latin1[i]);
15275 _PyUnicode_ClearStaticStrings();
15276 (void)PyUnicode_ClearFreeList();
15277}
15278
/* Intern the string referenced by *p, in place.

   On return *p either still refers to the same object, now flagged
   SSTATE_INTERNED_MORTAL, or has been replaced (the old reference is
   released by Py_SETREF) with the equal string already stored in the
   `interned` dict.

   The function never reports an error: if *p is not an exact str, is
   already interned, or a dict operation fails, it returns with *p
   unchanged and any exception cleared.  In Py_DEBUG builds a NULL or
   non-str *p trips an assertion instead of being ignored. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* Insert s as both key and value unless an equal key already exists;
       t is whichever string the dict holds afterwards. */
    Py_ALLOW_RECURSION
    t = PyDict_SetDefault(interned, s, s);
    Py_END_ALLOW_RECURSION
    if (t == NULL) {
        PyErr_Clear();
        return;
    }
    if (t != s) {
        /* An equal string was interned earlier: hand that one back. */
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15321
15322void
15323PyUnicode_InternImmortal(PyObject **p)
15324{
15325 PyUnicode_InternInPlace(p);
15326 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15327 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15328 Py_INCREF(*p);
15329 }
15330}
15331
15332PyObject *
15333PyUnicode_InternFromString(const char *cp)
15334{
15335 PyObject *s = PyUnicode_FromString(cp);
15336 if (s == NULL)
15337 return NULL;
15338 PyUnicode_InternInPlace(&s);
15339 return s;
15340}
15341
/* Undo interning for every string in the `interned` dict and drop the dict.

   Each string gets back the references that interning left uncounted and
   is flagged SSTATE_NOT_INTERNED.  A count and the mortal/immortal length
   totals are written to stderr.  Does nothing when there is no interned
   dict or its keys cannot be listed. */
void
_Py_ReleaseInternedUnicodeStrings(void)
{
    PyObject *keys;
    PyObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
    for (i = 0; i < n; i++) {
        s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            assert(0 && "could not ready string");
            fprintf(stderr, "could not ready string\n");
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* One reference is given back for an immortal string. */
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Two references are given back, matching the two subtracted
               when the string was interned as mortal. */
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
    /* The reported sizes are sums of string lengths (in characters). */
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
15396
15397
15398/********************* Unicode Iterator **************************/
15399
/* State of a str iterator: the string being walked and the index of the
   next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;       /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15405
/* Destructor: stop GC tracking, release the string reference (it_seq may
   already be NULL), and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15413
/* GC traversal: the only object reference the iterator holds is it_seq. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15420
15421static PyObject *
15422unicodeiter_next(unicodeiterobject *it)
15423{
15424 PyObject *seq, *item;
15425
15426 assert(it != NULL);
15427 seq = it->it_seq;
15428 if (seq == NULL)
15429 return NULL;
15430 assert(_PyUnicode_CHECK(seq));
15431
15432 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15433 int kind = PyUnicode_KIND(seq);
15434 void *data = PyUnicode_DATA(seq);
15435 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15436 item = PyUnicode_FromOrdinal(chr);
15437 if (item != NULL)
15438 ++it->it_index;
15439 return item;
15440 }
15441
15442 it->it_seq = NULL;
15443 Py_DECREF(seq);
15444 return NULL;
15445}
15446
15447static PyObject *
15448unicodeiter_len(unicodeiterobject *it)
15449{
15450 Py_ssize_t len = 0;
15451 if (it->it_seq)
15452 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15453 return PyLong_FromSsize_t(len);
15454}
15455
15456PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15457
15458static PyObject *
15459unicodeiter_reduce(unicodeiterobject *it)
15460{
15461 if (it->it_seq != NULL) {
15462 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15463 it->it_seq, it->it_index);
15464 } else {
15465 PyObject *u = (PyObject *)_PyUnicode_New(0);
15466 if (u == NULL)
15467 return NULL;
15468 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15469 }
15470}
15471
15472PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15473
15474static PyObject *
15475unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15476{
15477 Py_ssize_t index = PyLong_AsSsize_t(state);
15478 if (index == -1 && PyErr_Occurred())
15479 return NULL;
15480 if (it->it_seq != NULL) {
15481 if (index < 0)
15482 index = 0;
15483 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15484 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15485 it->it_index = index;
15486 }
15487 Py_RETURN_NONE;
15488}
15489
15490PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15491
/* Method table of the str iterator: length hint plus pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15501
/* Type object for "str_iterator", the iterator created by unicode_iter().
   Instances are GC-tracked (Py_TPFLAGS_HAVE_GC) and are their own
   iterator (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
15534
15535static PyObject *
15536unicode_iter(PyObject *seq)
15537{
15538 unicodeiterobject *it;
15539
15540 if (!PyUnicode_Check(seq)) {
15541 PyErr_BadInternalCall();
15542 return NULL;
15543 }
15544 if (PyUnicode_READY(seq) == -1)
15545 return NULL;
15546 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15547 if (it == NULL)
15548 return NULL;
15549 it->it_index = 0;
15550 Py_INCREF(seq);
15551 it->it_seq = seq;
15552 _PyObject_GC_TRACK(it);
15553 return (PyObject *)it;
15554}
15555
15556
/* Length of the NUL-terminated Py_UNICODE string `u`, not counting the
   terminator.  Implemented with wcslen(), so it relies on Py_UNICODE being
   compatible with wchar_t. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    return wcslen(u);
}
15562
/* Copy the NUL-terminated string `s2`, terminator included, into `s1`.
   The destination must be large enough; no bounds are checked.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15570
/* Copy at most `n` elements of the NUL-terminated string `s2` into `s1`.

   Copying stops after the terminating NUL has been copied, or once `n`
   elements have been written, whichever comes first.  As with strncpy(),
   the result is not NUL-terminated when `s2` holds `n` or more characters.
   Unlike strncpy(), the remainder of `s1` is not padded with NULs.
   Returns `s1`.

   The previous loop stored an element before testing the count, so it wrote
   up to n + 1 elements (and one element even for n == 0), overrunning a
   destination sized for exactly `n`. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Test the budget before each store so that s1[n] is never touched. */
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15580
/* Append the NUL-terminated string `s2`, terminator included, to the end
   of the NUL-terminated string in `s1`.  The destination must have room;
   no bounds are checked.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1 + wcslen(s1);

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15589
/* Compare two NUL-terminated strings element by element.
   Returns -1, 0 or +1 when `s1` sorts before, equal to, or after `s2`;
   a string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix; equality with a non-NUL *s1 implies *s2 is
       non-NUL as well. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0) {
        return (*s2 == 0) ? 0 : -1;
    }
    if (*s2 == 0) {
        return 1;
    }
    return (*s1 < *s2) ? -1 : +1;
}
15603
/* Compare at most `n` elements of two NUL-terminated strings.
   Returns -1 or +1 at the first differing element, and 0 when the strings
   agree up to a shared terminator or over the first `n` elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE a = s1[i];
        Py_UNICODE b = s2[i];

        if (a != b) {
            return (a < b) ? -1 : +1;
        }
        if (a == '\0') {
            break;
        }
    }
    return 0;
}
15620
/* Find the first occurrence of `c` in the NUL-terminated string `s`.
   Returns a pointer to it, or NULL if it does not occur.  The terminator
   itself is never matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15630
/* Find the last occurrence of `c` in the NUL-terminated string `s`.
   Returns a pointer to it, or NULL if it does not occur.  The terminator
   itself is never matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t i = wcslen(s);

    /* Walk backwards from the element just before the terminator. */
    while (i > 0) {
        i--;
        if (s[i] == c) {
            return (Py_UNICODE*)(s + i);
        }
    }
    return NULL;
}
15643
15644Py_UNICODE*
15645PyUnicode_AsUnicodeCopy(PyObject *unicode)
15646{
15647 Py_UNICODE *u, *copy;
15648 Py_ssize_t len, size;
15649
15650 if (!PyUnicode_Check(unicode)) {
15651 PyErr_BadArgument();
15652 return NULL;
15653 }
15654 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15655 if (u == NULL)
15656 return NULL;
15657 /* Ensure we won't overflow the size. */
15658 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15659 PyErr_NoMemory();
15660 return NULL;
15661 }
15662 size = len + 1; /* copy the null character */
15663 size *= sizeof(Py_UNICODE);
15664 copy = PyMem_Malloc(size);
15665 if (copy == NULL) {
15666 PyErr_NoMemory();
15667 return NULL;
15668 }
15669 memcpy(copy, u, size);
15670 return copy;
15671}
15672
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Both functions take exactly one argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15683
/* Module definition for "_string"; it provides only the functions listed
   in _string_methods. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_slots */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
15695
/* Module initialization function for "_string": creates the module object
   from _string_module and returns the result of PyModule_Create(). */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
15701
15702
15703#ifdef __cplusplus
15704}
15705#endif
15706